From ecf2abbdadd0b416a0e653219b590e9e41e53f4b Mon Sep 17 00:00:00 2001 From: pantonshire Date: Mon, 5 Jun 2023 10:57:19 +0100 Subject: [PATCH] retrieve data from unicode.org in data.py --- data.py | 32 +++++++++++++++++++++++++++----- get_data.sh | 2 -- unicode_data_encoded.gz | Bin 323382 -> 323382 bytes 3 files changed, 27 insertions(+), 7 deletions(-) delete mode 100755 get_data.sh diff --git a/data.py b/data.py index 527fd14..8032ecb 100644 --- a/data.py +++ b/data.py @@ -52,6 +52,12 @@ from enum import Enum from struct import pack from typing import Optional from gzip import compress +from time import time +import http.client + +unicode_data_host = 'www.unicode.org' +unicode_data_url_path = '/Public/UCD/latest/ucd/UnicodeData.txt' +out_data_path = 'unicode_data_encoded.gz' class StringTableIndex: def __init__(self, bs: bytes): @@ -247,19 +253,31 @@ def encode_char_data( return bytes(encoded) -with open('unicode_data_latest.txt', 'r') as fd: - input_data = fd.read() +print('Fetching Unicode data from {}...'.format(unicode_data_host)) +start_time = time() + +conn = http.client.HTTPConnection(unicode_data_host, timeout=30) +conn.request('GET', unicode_data_url_path) +resp = conn.getresponse() +resp_data = resp.read() +input_data = resp_data.decode('utf-8') + +end_time = time() +print('Fetched Unicode data in {:.2f}s'.format(end_time - start_time)) char_data_table = bytearray() string_table = StringTable() -# FIXME: calculate cumulative offset required for codepoints after each group groups = [] in_group = False prev_code = None uniq_vals = {} -for row in input_data.splitlines(): +rows = [row.strip() for row in input_data.splitlines() if len(row.strip()) > 0] + +print('Encoding {} rows...'.format(len(rows))) + +for row in rows: is_group_start = False [ @@ -425,5 +443,9 @@ encoded_data.extend(string_table) compressed_data = compress(encoded_data) -with open('unicode_data_encoded.gz', 'wb') as fd: +print('Writing encoded data to {}...'.format(out_data_path)) + +with open(out_data_path, 'wb') as fd: fd.write(compressed_data) + +print('Done!') diff --git a/get_data.sh b/get_data.sh deleted file mode 100755 index 0254d53..0000000 --- a/get_data.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump/unicode_data_latest.txt diff --git a/unicode_data_encoded.gz b/unicode_data_encoded.gz index d110b3b9b1cff7cccbb4d9f3a7214009f6dc349e..100b05ce3b97d1f921815001047ebe02bf29a5cd 100644 GIT binary patch delta 30 mcmdmXPk7rsVRrd$4u%CAY8%;G*%@2enOfPIx3aV7zXt%UdkN(L delta 30 mcmdmXPk7rsVRrd$4hHUh6^-nz?2N7KOs(w9TiIFk-va=m6A4)W