aboutsummaryrefslogtreecommitdiffstats
path: root/library/cpp/tld/gen_tld.py
diff options
context:
space:
mode:
authorNikita Kozlovskiy <nikitka@gmail.com>2023-06-28 14:27:42 +0000
committernkozlovskiy <nmk@ydb.tech>2023-06-28 17:27:42 +0300
commit61de4a67fa045e1281fb392df8cbfbcd890bc104 (patch)
tree45c7b314f4569fb2332f741e5f0d5caebbb9f54b /library/cpp/tld/gen_tld.py
parentf41f90eacc1ab531dbc05cf10ced62bb31d333be (diff)
downloadydb-61de4a67fa045e1281fb392df8cbfbcd890bc104.tar.gz
fix gen_tld.py on non-unicode locale encodings
fix gen_tld.py on non-unicode locale encodings In Python versions prior to 3.7, as well as when using non-Unicode locale encodings, the `gen_tld.py` script fails: ``` ydb/ydb/library/cpp/tld % git rev-parse head 97b1a695d3be4edc08550d3ae7d200f6d9f3d42e ydb/ydb/library/cpp/tld % LC_CTYPE=C ~/.pyenv/versions/3.6.15/bin/python gen_tld.py tlds-alpha-by-domain.txt|md5 Traceback (most recent call last): File "gen_tld.py", line 57, in <module> main() File "gen_tld.py", line 39, in main sys.stdout.write('%s*/\n' % str.rstrip()) UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-2: ordinal not in range(128) ``` This pull request fixes this behaviour by explicitly setting the output encoding to UTF-8. To ensure that I do not break anything, I checked the MD5 hash of the generated file before and after making my changes: Before ``` ydb/ydb/library/cpp/tld % git rev-parse head 97b1a695d3be4edc08550d3ae7d200f6d9f3d42e ydb/ydb/library/cpp/tld % python3 gen_tld.py tlds-alpha-by-domain.txt|md5 564242d355d842db790977df3642a405 ``` After ``` ydb/ydb/library/cpp/tld % git rev-parse head 1096dd7f034c573aabdf3bac2dc4b181a6688c71 ydb/ydb/library/cpp/tld % python3 gen_tld.py tlds-alpha-by-domain.txt|md5 564242d355d842db790977df3642a405 ydb/ydb/library/cpp/tld % LC_CTYPE=C ~/.pyenv/versions/3.6.15/bin/python gen_tld.py tlds-alpha-by-domain.txt|md5 564242d355d842db790977df3642a405 ``` Pull Request resolved: #279
Diffstat (limited to 'library/cpp/tld/gen_tld.py')
-rwxr-xr-xlibrary/cpp/tld/gen_tld.py25
1 files changed, 14 insertions, 11 deletions
diff --git a/library/cpp/tld/gen_tld.py b/library/cpp/tld/gen_tld.py
index 48a8f815d9..b7fa5a1e89 100755
--- a/library/cpp/tld/gen_tld.py
+++ b/library/cpp/tld/gen_tld.py
@@ -1,7 +1,8 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-import sys, os
+import sys
+
def main():
alphabet = 'abcdefghijklmnopqrstuvwxyz'
@@ -26,32 +27,34 @@ def main():
tlds[label].append(domain)
break
- print('// actual list can be found at http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
- print('static const char* const TopLevelDomains[] = {')
+ stdout = open(sys.stdout.fileno(), "w", encoding="utf-8", closefd=False)
+
+ stdout.write('// actual list can be found at http://data.iana.org/TLD/tlds-alpha-by-domain.txt\n')
+ stdout.write('static const char* const TopLevelDomains[] = {\n')
for label, value in sorted(tlds.items()):
if label == 'xn--':
- sys.stdout.write(' /* ')
+ stdout.write(' /* ')
str = ''
for n in value:
unicode_domain = n.decode('idna')
str += ('%s, ' % unicode_domain)
- sys.stdout.write('%s*/\n' % str.rstrip())
+ stdout.write('%s*/\n' % str.rstrip())
- sys.stdout.write(' ')
+ stdout.write(' ')
str = ''
for n in value:
str += ('"%s", ' % n.decode('utf-8'))
- sys.stdout.write('%s\n' % str.rstrip())
+ stdout.write('%s\n' % str.rstrip())
else:
- sys.stdout.write(' ')
+ stdout.write(' ')
str = ''
for n in value:
str += ('"%s", ' % n.decode('utf-8'))
- sys.stdout.write('%s\n' % str.rstrip())
+ stdout.write('%s\n' % str.rstrip())
- print(' 0')
- print('};')
+ stdout.write(' 0\n')
+ stdout.write('};\n')
if __name__ == '__main__':
main()