diff --git a/data.py b/data.py
index 527fd14..8032ecb 100644
--- a/data.py
+++ b/data.py
@@ -52,6 +52,12 @@ from enum import Enum
 from struct import pack
 from typing import Optional
 from gzip import compress
+from time import time
+import http.client
+
+unicode_data_host = 'www.unicode.org'
+unicode_data_url_path = '/Public/UCD/latest/ucd/UnicodeData.txt'
+out_data_path = 'unicode_data_encoded.gz'
 
 class StringTableIndex:
     def __init__(self, bs: bytes):
@@ -247,19 +253,40 @@ def encode_char_data(
     return bytes(encoded)
 
 
-with open('unicode_data_latest.txt', 'r') as fd:
-    input_data = fd.read()
+print('Fetching Unicode data from {}...'.format(unicode_data_host))
+start_time = time()
+
+# Fetch over TLS, as the removed get_data.sh did; a plaintext connection
+# would let the source table be altered in transit.
+conn = http.client.HTTPSConnection(unicode_data_host, timeout=30)
+try:
+    conn.request('GET', unicode_data_url_path)
+    resp = conn.getresponse()
+    if resp.status != 200:
+        raise RuntimeError(
+            'Failed to fetch Unicode data: HTTP {} {}'.format(
+                resp.status, resp.reason))
+    resp_data = resp.read()
+finally:
+    conn.close()
+input_data = resp_data.decode('utf-8')
+
+end_time = time()
+print('Fetched Unicode data in {:.2f}s'.format(end_time - start_time))
 
 char_data_table = bytearray()
 string_table = StringTable()
 
-# FIXME: calculate cumulative offset required for codepoints after each group
 groups = []
 in_group = False
 prev_code = None
 uniq_vals = {}
 
-for row in input_data.splitlines():
+rows = [row.strip() for row in input_data.splitlines() if len(row.strip()) > 0]
+
+print('Encoding {} rows...'.format(len(rows)))
+
+for row in rows:
     is_group_start = False
 
     [
@@ -425,5 +452,9 @@ encoded_data.extend(string_table)
 
 compressed_data = compress(encoded_data)
 
-with open('unicode_data_encoded.gz', 'wb') as fd:
+print('Writing encoded data to {}...'.format(out_data_path))
+
+with open(out_data_path, 'wb') as fd:
     fd.write(compressed_data)
+
+print('Done!')
diff --git a/get_data.sh b/get_data.sh
deleted file mode 100755
index 0254d53..0000000
--- a/get_data.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/bash
-curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump/unicode_data_latest.txt
diff --git a/unicode_data_encoded.gz b/unicode_data_encoded.gz
index d110b3b..100b05c 100644
Binary files a/unicode_data_encoded.gz and b/unicode_data_encoded.gz differ