aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/tools/python3/Lib/encodings/idna.py
diff options
context:
space:
mode:
authorAlexSm <alex@ydb.tech>2024-03-05 10:40:59 +0100
committerGitHub <noreply@github.com>2024-03-05 12:40:59 +0300
commit1ac13c847b5358faba44dbb638a828e24369467b (patch)
tree07672b4dd3604ad3dee540a02c6494cb7d10dc3d /contrib/tools/python3/Lib/encodings/idna.py
parentffcca3e7f7958ddc6487b91d3df8c01054bd0638 (diff)
downloadydb-1ac13c847b5358faba44dbb638a828e24369467b.tar.gz
Library import 16 (#2433)
Co-authored-by: robot-piglet <robot-piglet@yandex-team.com> Co-authored-by: deshevoy <deshevoy@yandex-team.com> Co-authored-by: robot-contrib <robot-contrib@yandex-team.com> Co-authored-by: thegeorg <thegeorg@yandex-team.com> Co-authored-by: robot-ya-builder <robot-ya-builder@yandex-team.com> Co-authored-by: svidyuk <svidyuk@yandex-team.com> Co-authored-by: shadchin <shadchin@yandex-team.com> Co-authored-by: robot-ratatosk <robot-ratatosk@yandex-team.com> Co-authored-by: innokentii <innokentii@yandex-team.com> Co-authored-by: arkady-e1ppa <arkady-e1ppa@yandex-team.com> Co-authored-by: snermolaev <snermolaev@yandex-team.com> Co-authored-by: dimdim11 <dimdim11@yandex-team.com> Co-authored-by: kickbutt <kickbutt@yandex-team.com> Co-authored-by: abdullinsaid <abdullinsaid@yandex-team.com> Co-authored-by: korsunandrei <korsunandrei@yandex-team.com> Co-authored-by: petrk <petrk@yandex-team.com> Co-authored-by: miroslav2 <miroslav2@yandex-team.com> Co-authored-by: serjflint <serjflint@yandex-team.com> Co-authored-by: akhropov <akhropov@yandex-team.com> Co-authored-by: prettyboy <prettyboy@yandex-team.com> Co-authored-by: ilikepugs <ilikepugs@yandex-team.com> Co-authored-by: hiddenpath <hiddenpath@yandex-team.com> Co-authored-by: mikhnenko <mikhnenko@yandex-team.com> Co-authored-by: spreis <spreis@yandex-team.com> Co-authored-by: andreyshspb <andreyshspb@yandex-team.com> Co-authored-by: dimaandreev <dimaandreev@yandex-team.com> Co-authored-by: rashid <rashid@yandex-team.com> Co-authored-by: robot-ydb-importer <robot-ydb-importer@yandex-team.com> Co-authored-by: r-vetrov <r-vetrov@yandex-team.com> Co-authored-by: ypodlesov <ypodlesov@yandex-team.com> Co-authored-by: zaverden <zaverden@yandex-team.com> Co-authored-by: vpozdyayev <vpozdyayev@yandex-team.com> Co-authored-by: robot-cozmo <robot-cozmo@yandex-team.com> Co-authored-by: v-korovin <v-korovin@yandex-team.com> Co-authored-by: arikon <arikon@yandex-team.com> Co-authored-by: khoden <khoden@yandex-team.com> Co-authored-by: psydmm <psydmm@yandex-team.com> Co-authored-by: robot-javacom <robot-javacom@yandex-team.com> Co-authored-by: dtorilov <dtorilov@yandex-team.com> Co-authored-by: sennikovmv <sennikovmv@yandex-team.com> Co-authored-by: hcpp <hcpp@ydb.tech>
Diffstat (limited to 'contrib/tools/python3/Lib/encodings/idna.py')
-rw-r--r--contrib/tools/python3/Lib/encodings/idna.py317
1 files changed, 317 insertions, 0 deletions
diff --git a/contrib/tools/python3/Lib/encodings/idna.py b/contrib/tools/python3/Lib/encodings/idna.py
new file mode 100644
index 0000000000..5396047a7f
--- /dev/null
+++ b/contrib/tools/python3/Lib/encodings/idna.py
@@ -0,0 +1,317 @@
+# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
+
+import stringprep, re, codecs
+from unicodedata import ucd_3_2_0 as unicodedata
+
+# IDNA section 3.1
+dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")
+
+# IDNA section 5
+ace_prefix = b"xn--"
+sace_prefix = "xn--"
+
+# This assumes query strings, so AllowUnassigned is true
+def nameprep(label):
+ # Map
+ newlabel = []
+ for c in label:
+ if stringprep.in_table_b1(c):
+ # Map to nothing
+ continue
+ newlabel.append(stringprep.map_table_b2(c))
+ label = "".join(newlabel)
+
+ # Normalize
+ label = unicodedata.normalize("NFKC", label)
+
+ # Prohibit
+ for c in label:
+ if stringprep.in_table_c12(c) or \
+ stringprep.in_table_c22(c) or \
+ stringprep.in_table_c3(c) or \
+ stringprep.in_table_c4(c) or \
+ stringprep.in_table_c5(c) or \
+ stringprep.in_table_c6(c) or \
+ stringprep.in_table_c7(c) or \
+ stringprep.in_table_c8(c) or \
+ stringprep.in_table_c9(c):
+ raise UnicodeError("Invalid character %r" % c)
+
+ # Check bidi
+ RandAL = [stringprep.in_table_d1(x) for x in label]
+ if any(RandAL):
+ # There is a RandAL char in the string. Must perform further
+ # tests:
+ # 1) The characters in section 5.8 MUST be prohibited.
+ # This is table C.8, which was already checked
+ # 2) If a string contains any RandALCat character, the string
+ # MUST NOT contain any LCat character.
+ if any(stringprep.in_table_d2(x) for x in label):
+ raise UnicodeError("Violation of BIDI requirement 2")
+ # 3) If a string contains any RandALCat character, a
+ # RandALCat character MUST be the first character of the
+ # string, and a RandALCat character MUST be the last
+ # character of the string.
+ if not RandAL[0] or not RandAL[-1]:
+ raise UnicodeError("Violation of BIDI requirement 3")
+
+ return label
+
+def ToASCII(label):
+ try:
+ # Step 1: try ASCII
+ label = label.encode("ascii")
+ except UnicodeError:
+ pass
+ else:
+ # Skip to step 3: UseSTD3ASCIIRules is false, so
+ # Skip to step 8.
+ if 0 < len(label) < 64:
+ return label
+ raise UnicodeError("label empty or too long")
+
+ # Step 2: nameprep
+ label = nameprep(label)
+
+ # Step 3: UseSTD3ASCIIRules is false
+ # Step 4: try ASCII
+ try:
+ label = label.encode("ascii")
+ except UnicodeError:
+ pass
+ else:
+ # Skip to step 8.
+ if 0 < len(label) < 64:
+ return label
+ raise UnicodeError("label empty or too long")
+
+ # Step 5: Check ACE prefix
+ if label.startswith(sace_prefix):
+ raise UnicodeError("Label starts with ACE prefix")
+
+ # Step 6: Encode with PUNYCODE
+ label = label.encode("punycode")
+
+ # Step 7: Prepend ACE prefix
+ label = ace_prefix + label
+
+ # Step 8: Check size
+ if 0 < len(label) < 64:
+ return label
+ raise UnicodeError("label empty or too long")
+
+def ToUnicode(label):
+ if len(label) > 1024:
+ # Protection from https://github.com/python/cpython/issues/98433.
+ # https://datatracker.ietf.org/doc/html/rfc5894#section-6
+ # doesn't specify a label size limit prior to NAMEPREP. But having
+ # one makes practical sense.
+ # This leaves ample room for nameprep() to remove Nothing characters
+ # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
+ # preventing us from wasting time decoding a big thing that'll just
+ # hit the actual <= 63 length limit in Step 6.
+ raise UnicodeError("label way too long")
+ # Step 1: Check for ASCII
+ if isinstance(label, bytes):
+ pure_ascii = True
+ else:
+ try:
+ label = label.encode("ascii")
+ pure_ascii = True
+ except UnicodeError:
+ pure_ascii = False
+ if not pure_ascii:
+ # Step 2: Perform nameprep
+ label = nameprep(label)
+ # It doesn't say this, but apparently, it should be ASCII now
+ try:
+ label = label.encode("ascii")
+ except UnicodeError:
+ raise UnicodeError("Invalid character in IDN label")
+ # Step 3: Check for ACE prefix
+ if not label.startswith(ace_prefix):
+ return str(label, "ascii")
+
+ # Step 4: Remove ACE prefix
+ label1 = label[len(ace_prefix):]
+
+ # Step 5: Decode using PUNYCODE
+ result = label1.decode("punycode")
+
+ # Step 6: Apply ToASCII
+ label2 = ToASCII(result)
+
+ # Step 7: Compare the result of step 6 with the one of step 3
+ # label2 will already be in lower case.
+ if str(label, "ascii").lower() != str(label2, "ascii"):
+ raise UnicodeError("IDNA does not round-trip", label, label2)
+
+ # Step 8: return the result of step 5
+ return result
+
+### Codec APIs
+
+class Codec(codecs.Codec):
+ def encode(self, input, errors='strict'):
+
+ if errors != 'strict':
+ # IDNA is quite clear that implementations must be strict
+ raise UnicodeError("unsupported error handling "+errors)
+
+ if not input:
+ return b'', 0
+
+ try:
+ result = input.encode('ascii')
+ except UnicodeEncodeError:
+ pass
+ else:
+ # ASCII name: fast path
+ labels = result.split(b'.')
+ for label in labels[:-1]:
+ if not (0 < len(label) < 64):
+ raise UnicodeError("label empty or too long")
+ if len(labels[-1]) >= 64:
+ raise UnicodeError("label too long")
+ return result, len(input)
+
+ result = bytearray()
+ labels = dots.split(input)
+ if labels and not labels[-1]:
+ trailing_dot = b'.'
+ del labels[-1]
+ else:
+ trailing_dot = b''
+ for label in labels:
+ if result:
+ # Join with U+002E
+ result.extend(b'.')
+ result.extend(ToASCII(label))
+ return bytes(result+trailing_dot), len(input)
+
+ def decode(self, input, errors='strict'):
+
+ if errors != 'strict':
+ raise UnicodeError("Unsupported error handling "+errors)
+
+ if not input:
+ return "", 0
+
+ # IDNA allows decoding to operate on Unicode strings, too.
+ if not isinstance(input, bytes):
+ # XXX obviously wrong, see #3232
+ input = bytes(input)
+
+ if ace_prefix not in input:
+ # Fast path
+ try:
+ return input.decode('ascii'), len(input)
+ except UnicodeDecodeError:
+ pass
+
+ labels = input.split(b".")
+
+ if labels and len(labels[-1]) == 0:
+ trailing_dot = '.'
+ del labels[-1]
+ else:
+ trailing_dot = ''
+
+ result = []
+ for label in labels:
+ result.append(ToUnicode(label))
+
+ return ".".join(result)+trailing_dot, len(input)
+
+class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
+ def _buffer_encode(self, input, errors, final):
+ if errors != 'strict':
+ # IDNA is quite clear that implementations must be strict
+ raise UnicodeError("unsupported error handling "+errors)
+
+ if not input:
+ return (b'', 0)
+
+ labels = dots.split(input)
+ trailing_dot = b''
+ if labels:
+ if not labels[-1]:
+ trailing_dot = b'.'
+ del labels[-1]
+ elif not final:
+ # Keep potentially unfinished label until the next call
+ del labels[-1]
+ if labels:
+ trailing_dot = b'.'
+
+ result = bytearray()
+ size = 0
+ for label in labels:
+ if size:
+ # Join with U+002E
+ result.extend(b'.')
+ size += 1
+ result.extend(ToASCII(label))
+ size += len(label)
+
+ result += trailing_dot
+ size += len(trailing_dot)
+ return (bytes(result), size)
+
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+ def _buffer_decode(self, input, errors, final):
+ if errors != 'strict':
+ raise UnicodeError("Unsupported error handling "+errors)
+
+ if not input:
+ return ("", 0)
+
+ # IDNA allows decoding to operate on Unicode strings, too.
+ if isinstance(input, str):
+ labels = dots.split(input)
+ else:
+ # Must be ASCII string
+ input = str(input, "ascii")
+ labels = input.split(".")
+
+ trailing_dot = ''
+ if labels:
+ if not labels[-1]:
+ trailing_dot = '.'
+ del labels[-1]
+ elif not final:
+ # Keep potentially unfinished label until the next call
+ del labels[-1]
+ if labels:
+ trailing_dot = '.'
+
+ result = []
+ size = 0
+ for label in labels:
+ result.append(ToUnicode(label))
+ if size:
+ size += 1
+ size += len(label)
+
+ result = ".".join(result) + trailing_dot
+ size += len(trailing_dot)
+ return (result, size)
+
+class StreamWriter(Codec,codecs.StreamWriter):
+ pass
+
+class StreamReader(Codec,codecs.StreamReader):
+ pass
+
+### encodings module API
+
+def getregentry():
+ return codecs.CodecInfo(
+ name='idna',
+ encode=Codec().encode,
+ decode=Codec().decode,
+ incrementalencoder=IncrementalEncoder,
+ incrementaldecoder=IncrementalDecoder,
+ streamwriter=StreamWriter,
+ streamreader=StreamReader,
+ )