From ecf2abbdadd0b416a0e653219b590e9e41e53f4b Mon Sep 17 00:00:00 2001
From: pantonshire <tom@tomandtally.co.uk>
Date: Mon, 5 Jun 2023 10:57:19 +0100
Subject: [PATCH] retrieve data from unicode.org in data.py

---
 data.py                 |  32 +++++++++++++++++++++++++++-----
 get_data.sh             |   2 --
 unicode_data_encoded.gz | Bin 323382 -> 323382 bytes
 3 files changed, 27 insertions(+), 7 deletions(-)
 delete mode 100755 get_data.sh

diff --git a/data.py b/data.py
index 527fd14..8032ecb 100644
--- a/data.py
+++ b/data.py
@@ -52,6 +52,12 @@ from enum import Enum
 from struct import pack
 from typing import Optional
 from gzip import compress
+from time import time
+import http.client
+
+unicode_data_host = 'www.unicode.org'  # host the UnicodeData.txt request is sent to
+unicode_data_url_path = '/Public/UCD/latest/ucd/UnicodeData.txt'  # request path on that host
+out_data_path = 'unicode_data_encoded.gz'  # file the gzip-compressed encoded table is written to
 
 class StringTableIndex:
     def __init__(self, bs: bytes):
@@ -247,19 +253,31 @@ def encode_char_data(
 
     return bytes(encoded)
 
-with open('unicode_data_latest.txt', 'r') as fd:
-    input_data = fd.read()
+# Download UnicodeData.txt over HTTPS (as the removed get_data.sh did) and time it.
+print('Fetching Unicode data from {}...'.format(unicode_data_host))
+start_time = time()
+conn = http.client.HTTPSConnection(unicode_data_host, timeout=30)
+conn.request('GET', unicode_data_url_path)
+resp = conn.getresponse()
+if resp.status != 200:  # http.client does not follow redirects or raise on error statuses
+    raise RuntimeError('Failed to fetch Unicode data: HTTP {} {}'.format(resp.status, resp.reason))
+input_data = resp.read().decode('utf-8')
+conn.close()
+print('Fetched Unicode data in {:.2f}s'.format(time() - start_time))
 
 char_data_table = bytearray()
 string_table = StringTable()
-# FIXME: calculate cumulative offset required for codepoints after each group
 groups = []
 in_group = False
 prev_code = None
 
 uniq_vals = {}
 
-for row in input_data.splitlines():    
+rows = [row for row in map(str.strip, input_data.splitlines()) if row]  # stripped, non-blank lines only
+
+print('Encoding {} rows...'.format(len(rows)))
+
+for row in rows:    
     is_group_start = False
 
     [
@@ -425,5 +443,9 @@ encoded_data.extend(string_table)
 
 compressed_data = compress(encoded_data)
 
-with open('unicode_data_encoded.gz', 'wb') as fd:
+print('Writing encoded data to {}...'.format(out_data_path))
+
+with open(out_data_path, 'wb') as fd:  # binary mode: compressed_data is the bytes returned by gzip.compress
     fd.write(compressed_data)
+
+print('Done!')
diff --git a/get_data.sh b/get_data.sh
deleted file mode 100755
index 0254d53..0000000
--- a/get_data.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/bash
-curl --proto '=https' --tlsv1.2 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt' > utfdump/unicode_data_latest.txt
diff --git a/unicode_data_encoded.gz b/unicode_data_encoded.gz
index d110b3b9b1cff7cccbb4d9f3a7214009f6dc349e..100b05ce3b97d1f921815001047ebe02bf29a5cd 100644
GIT binary patch
delta 30
mcmdmXPk7rsVRrd$4u%CAY8%;G*%@2enOfPIx3aV7zXt%UdkN(L

delta 30
mcmdmXPk7rsVRrd$4hHUh6^-nz?2N7KOs(w9TiIFk-va=m6A4)W