retrieve data from unicode.org in data.py

main
pantonshire 3 years ago
parent 6b911caa15
commit ecf2abbdad

@ -52,6 +52,12 @@ from enum import Enum
from struct import pack from struct import pack
from typing import Optional from typing import Optional
from gzip import compress from gzip import compress
from time import time
import http.client
unicode_data_host = 'www.unicode.org'
unicode_data_url_path = '/Public/UCD/latest/ucd/UnicodeData.txt'
out_data_path = 'unicode_data_encoded.gz'
class StringTableIndex: class StringTableIndex:
def __init__(self, bs: bytes): def __init__(self, bs: bytes):
@ -247,19 +253,31 @@ def encode_char_data(
return bytes(encoded) return bytes(encoded)
with open('unicode_data_latest.txt', 'r') as fd: print('Fetching Unicode data from {}...'.format(unicode_data_host))
input_data = fd.read() start_time = time()
# Download UnicodeData.txt over HTTPS. http.client does not follow redirects,
# so a plain-HTTP request that gets redirected would hand us the redirect body
# instead of the data file; talking TLS directly avoids that and protects the
# download from tampering in transit.
conn = http.client.HTTPSConnection(unicode_data_host, timeout=30)
try:
    conn.request('GET', unicode_data_url_path)
    resp = conn.getresponse()
    # Fail loudly on anything other than success rather than encoding an
    # error page as if it were Unicode data.
    if resp.status != 200:
        raise RuntimeError(
            'Failed to fetch Unicode data: HTTP {} {}'.format(resp.status, resp.reason)
        )
    resp_data = resp.read()
finally:
    # Always release the socket, including when the request or read fails.
    conn.close()
input_data = resp_data.decode('utf-8')
end_time = time()
print('Fetched Unicode data in {:.2f}s'.format(end_time - start_time))
char_data_table = bytearray() char_data_table = bytearray()
string_table = StringTable() string_table = StringTable()
# FIXME: calculate cumulative offset required for codepoints after each group
groups = [] groups = []
in_group = False in_group = False
prev_code = None prev_code = None
uniq_vals = {} uniq_vals = {}
for row in input_data.splitlines(): rows = [row.strip() for row in input_data.splitlines() if len(row.strip()) > 0]
print('Encoding {} rows...'.format(len(rows)))
for row in rows:
is_group_start = False is_group_start = False
[ [
@ -425,5 +443,9 @@ encoded_data.extend(string_table)
compressed_data = compress(encoded_data) compressed_data = compress(encoded_data)
with open('unicode_data_encoded.gz', 'wb') as fd: print('Writing encoded data to {}...'.format(out_data_path))
with open(out_data_path, 'wb') as fd:
fd.write(compressed_data) fd.write(compressed_data)
print('Done!')

@ -1,2 +0,0 @@
#!/bin/bash
curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump/unicode_data_latest.txt

Binary file not shown.
Loading…
Cancel
Save