retrieve data from unicode.org in data.py

main
pantonshire 3 years ago
parent 6b911caa15
commit ecf2abbdad

@ -52,6 +52,12 @@ from enum import Enum
from struct import pack
from typing import Optional
from gzip import compress
from time import time
import http.client
# Host that the Unicode data file is requested from.
unicode_data_host = 'www.unicode.org'
# Request path of UnicodeData.txt on that host (the 'latest' UCD directory).
unicode_data_url_path = '/Public/UCD/latest/ucd/UnicodeData.txt'
# File that the gzip-compressed encoded data is written to.
out_data_path = 'unicode_data_encoded.gz'
class StringTableIndex:
def __init__(self, bs: bytes):
@ -247,19 +253,31 @@ def encode_char_data(
return bytes(encoded)
with open('unicode_data_latest.txt', 'r') as fd:
input_data = fd.read()
# Download UnicodeData.txt and decode it into `input_data`.
#
# The request goes over HTTPS so the server is authenticated and the data
# cannot be altered in transit; everything encoded later is derived from it.
print('Fetching Unicode data from {}...'.format(unicode_data_host))
start_time = time()
conn = http.client.HTTPSConnection(unicode_data_host, timeout=30)
try:
    conn.request('GET', unicode_data_url_path)
    resp = conn.getresponse()
    # Fail loudly on anything but success: an error or redirect body must not
    # be mistaken for Unicode data and silently encoded.
    if resp.status != 200:
        raise RuntimeError(
            'Failed to fetch Unicode data from {}: HTTP {} {}'.format(
                unicode_data_host, resp.status, resp.reason
            )
        )
    resp_data = resp.read()
finally:
    # Always release the socket, even when the request or read fails.
    conn.close()
input_data = resp_data.decode('utf-8')
end_time = time()
print('Fetched Unicode data in {:.2f}s'.format(end_time - start_time))
char_data_table = bytearray()
string_table = StringTable()
# FIXME: calculate cumulative offset required for codepoints after each group
groups = []
in_group = False
prev_code = None
uniq_vals = {}
for row in input_data.splitlines():
# Trim surrounding whitespace from every line of the input and keep only the
# lines that are non-empty after trimming.
rows = list(filter(None, map(str.strip, input_data.splitlines())))
print('Encoding {} rows...'.format(len(rows)))
for row in rows:
is_group_start = False
[
@ -425,5 +443,9 @@ encoded_data.extend(string_table)
compressed_data = compress(encoded_data)
with open('unicode_data_encoded.gz', 'wb') as fd:
print('Writing encoded data to {}...'.format(out_data_path))
with open(out_data_path, 'wb') as fd:
fd.write(compressed_data)
print('Done!')

@ -1,2 +0,0 @@
#!/bin/bash
curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump/unicode_data_latest.txt

Binary file not shown.
Loading…
Cancel
Save