diff options
Diffstat (limited to 'contrib/tools/python3/Lib/email/_header_value_parser.py')
| -rw-r--r-- | contrib/tools/python3/Lib/email/_header_value_parser.py | 139 |
1 files changed, 95 insertions, 44 deletions
diff --git a/contrib/tools/python3/Lib/email/_header_value_parser.py b/contrib/tools/python3/Lib/email/_header_value_parser.py index 3d845c09d41..03fedd99539 100644 --- a/contrib/tools/python3/Lib/email/_header_value_parser.py +++ b/contrib/tools/python3/Lib/email/_header_value_parser.py @@ -80,7 +80,8 @@ from email import utils # Useful constants and functions # -WSP = set(' \t') +_WSP = ' \t' +WSP = set(_WSP) CFWS_LEADER = WSP | set('(') SPECIALS = set(r'()<>@,:;.\"[]') ATOM_ENDS = SPECIALS | WSP @@ -101,6 +102,12 @@ def make_quoted_pairs(value): return str(value).replace('\\', '\\\\').replace('"', '\\"') +def make_parenthesis_pairs(value): + """Escape parenthesis and backslash for use within a comment.""" + return str(value).replace('\\', '\\\\') \ + .replace('(', '\\(').replace(')', '\\)') + + def quote_string(value): escaped = make_quoted_pairs(value) return f'"{escaped}"' @@ -874,6 +881,12 @@ class MessageID(MsgID): class InvalidMessageID(MessageID): token_type = 'invalid-message-id' +class MessageIDList(TokenList): + token_type = 'message-id-list' + + @property + def message_ids(self): + return [x for x in self if x.token_type=='msg-id'] class Header(TokenList): token_type = 'header' @@ -933,7 +946,7 @@ class WhiteSpaceTerminal(Terminal): return ' ' def startswith_fws(self): - return True + return self and self[0] in WSP class ValueTerminal(Terminal): @@ -1020,6 +1033,8 @@ def _get_ptext_to_endchars(value, endchars): a flag that is True iff there were any quoted printables decoded. """ + if not value: + return '', '', False fragment, *remainder = _wsp_splitter(value, 1) vchars = [] escape = False @@ -1053,7 +1068,7 @@ def get_fws(value): fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws') return fws, newvalue -def get_encoded_word(value): +def get_encoded_word(value, terminal_type='vtext'): """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" """ @@ -1092,7 +1107,7 @@ def get_encoded_word(value): ew.append(token) continue chars, *remainder = _wsp_splitter(text, 1) - vtext = ValueTerminal(chars, 'vtext') + vtext = ValueTerminal(chars, terminal_type) _validate_xtext(vtext) ew.append(vtext) text = ''.join(remainder) @@ -1134,7 +1149,7 @@ def get_unstructured(value): valid_ew = True if value.startswith('=?'): try: - token, value = get_encoded_word(value) + token, value = get_encoded_word(value, 'utext') except _InvalidEwError: valid_ew = False except errors.HeaderParseError: @@ -1163,7 +1178,7 @@ def get_unstructured(value): # the parser to go in an infinite loop. if valid_ew and rfc2047_matcher.search(tok): tok, *remainder = value.partition('=?') - vtext = ValueTerminal(tok, 'vtext') + vtext = ValueTerminal(tok, 'utext') _validate_xtext(vtext) unstructured.append(vtext) value = ''.join(remainder) @@ -1573,7 +1588,7 @@ def get_dtext(value): def _check_for_early_dl_end(value, domain_literal): if value: return False - domain_literal.append(errors.InvalidHeaderDefect( + domain_literal.defects.append(errors.InvalidHeaderDefect( "end of input inside domain-literal")) domain_literal.append(ValueTerminal(']', 'domain-literal-end')) return True @@ -1592,9 +1607,9 @@ def get_domain_literal(value): raise errors.HeaderParseError("expected '[' at start of domain-literal " "but found '{}'".format(value)) value = value[1:] + domain_literal.append(ValueTerminal('[', 'domain-literal-start')) if _check_for_early_dl_end(value, domain_literal): return domain_literal, value - domain_literal.append(ValueTerminal('[', 'domain-literal-start')) if value[0] in WSP: token, value = get_fws(value) domain_literal.append(token) @@ -2169,6 +2184,32 @@ def parse_message_id(value): return message_id +def parse_message_ids(value): + """in-reply-to = "In-Reply-To:" 1*msg-id CRLF + references = "References:" 1*msg-id CRLF + """ + message_id_list = MessageIDList() + while value: + if value[0] == ',': + # message id list separated with commas - this is invalid, + # but happens rather frequently in the wild + message_id_list.defects.append( + errors.InvalidHeaderDefect("comma in msg-id list")) + message_id_list.append( + WhiteSpaceTerminal(' ', 'invalid-comma-replacement')) + value = value[1:] + continue + try: + token, value = get_msg_id(value) + message_id_list.append(token) + except errors.HeaderParseError as ex: + token = get_unstructured(value) + message_id_list.append(InvalidMessageID(token)) + message_id_list.defects.append( + errors.InvalidHeaderDefect("Invalid msg-id: {!r}".format(ex))) + break + return message_id_list + # # XXX: As I begin to add additional header parsers, I'm realizing we probably # have two level of parser routines: the get_XXX methods that get a token in @@ -2786,8 +2827,12 @@ def _steal_trailing_WSP_if_exists(lines): if lines and lines[-1] and lines[-1][-1] in WSP: wsp = lines[-1][-1] lines[-1] = lines[-1][:-1] + # gh-142006: if the line is now empty, remove it entirely. + if not lines[-1]: + lines.pop() return wsp + def _refold_parse_tree(parse_tree, *, policy): """Return string of contents of parse_tree folded according to RFC rules. @@ -2796,11 +2841,9 @@ def _refold_parse_tree(parse_tree, *, policy): maxlen = policy.max_line_length or sys.maxsize encoding = 'utf-8' if policy.utf8 else 'us-ascii' lines = [''] # Folded lines to be output - leading_whitespace = '' # When we have whitespace between two encoded - # words, we may need to encode the whitespace - # at the beginning of the second word. - last_ew = None # Points to the last encoded character if there's an ew on - # the line + last_word_is_ew = False + last_ew = None # if there is an encoded word in the last line of lines, + # points to the encoded word's first character last_charset = None wrap_as_ew_blocked = 0 want_encoding = False # This is set to True if we need to encode this part @@ -2813,7 +2856,7 @@ def _refold_parse_tree(parse_tree, *, policy): continue tstr = str(part) if not want_encoding: - if part.token_type == 'ptext': + if part.token_type in ('ptext', 'vtext'): # Encode if tstr contains special characters. want_encoding = not SPECIALSNL.isdisjoint(tstr) else: @@ -2835,6 +2878,7 @@ def _refold_parse_tree(parse_tree, *, policy): if part.token_type == 'mime-parameters': # Mime parameter folding (using RFC2231) is extra special. _fold_mime_parameters(part, lines, maxlen, encoding) + last_word_is_ew = False continue if want_encoding and not wrap_as_ew_blocked: @@ -2851,6 +2895,7 @@ def _refold_parse_tree(parse_tree, *, policy): # XXX what if encoded_part has no leading FWS? lines.append(newline) lines[-1] += encoded_part + last_word_is_ew = False continue # Either this is not a major syntactic break, so we don't # want it on a line by itself even if it fits, or it @@ -2869,11 +2914,16 @@ def _refold_parse_tree(parse_tree, *, policy): (last_charset == 'unknown-8bit' or last_charset == 'utf-8' and charset != 'us-ascii')): last_ew = None - last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew, - part.ew_combine_allowed, charset, leading_whitespace) - # This whitespace has been added to the lines in _fold_as_ew() - # so clear it now. - leading_whitespace = '' + last_ew = _fold_as_ew( + tstr, + lines, + maxlen, + last_ew, + part.ew_combine_allowed, + charset, + last_word_is_ew, + ) + last_word_is_ew = True last_charset = charset want_encoding = False continue @@ -2886,28 +2936,19 @@ def _refold_parse_tree(parse_tree, *, policy): if len(tstr) <= maxlen - len(lines[-1]): lines[-1] += tstr + last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP)) continue # This part is too long to fit. The RFC wants us to break at # "major syntactic breaks", so unless we don't consider this # to be one, check if it will fit on the next line by itself. - leading_whitespace = '' if (part.syntactic_break and len(tstr) + 1 <= maxlen): newline = _steal_trailing_WSP_if_exists(lines) if newline or part.startswith_fws(): - # We're going to fold the data onto a new line here. Due to - # the way encoded strings handle continuation lines, we need to - # be prepared to encode any whitespace if the next line turns - # out to start with an encoded word. lines.append(newline + tstr) - - whitespace_accumulator = [] - for char in lines[-1]: - if char not in WSP: - break - whitespace_accumulator.append(char) - leading_whitespace = ''.join(whitespace_accumulator) + last_word_is_ew = (last_word_is_ew + and not bool(lines[-1].strip(_WSP))) last_ew = None continue if not hasattr(part, 'encode'): @@ -2922,6 +2963,13 @@ def _refold_parse_tree(parse_tree, *, policy): [ValueTerminal(make_quoted_pairs(p), 'ptext') for p in newparts] + [ValueTerminal('"', 'ptext')]) + if part.token_type == 'comment': + newparts = ( + [ValueTerminal('(', 'ptext')] + + [ValueTerminal(make_parenthesis_pairs(p), 'ptext') + if p.token_type == 'ptext' else p + for p in newparts] + + [ValueTerminal(')', 'ptext')]) if not part.as_ew_allowed: wrap_as_ew_blocked += 1 newparts.append(end_ew_not_allowed) @@ -2940,10 +2988,11 @@ def _refold_parse_tree(parse_tree, *, policy): else: # We can't fold it onto the next line either... lines[-1] += tstr + last_word_is_ew = last_word_is_ew and not bool(tstr.strip(_WSP)) return policy.linesep.join(lines) + policy.linesep -def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace): +def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, last_word_is_ew): """Fold string to_encode into lines as encoded word, combining if allowed. Return the new value for last_ew, or None if ew_combine_allowed is False. @@ -2958,6 +3007,16 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, to_encode = str( get_unstructured(lines[-1][last_ew:] + to_encode)) lines[-1] = lines[-1][:last_ew] + elif last_word_is_ew: + # If we are following up an encoded word with another encoded word, + # any white space between the two will be ignored when decoded. + # Therefore, we encode all to-be-displayed whitespace in the second + # encoded word. + len_without_wsp = len(lines[-1].rstrip(_WSP)) + leading_whitespace = lines[-1][len_without_wsp:] + lines[-1] = (lines[-1][:len_without_wsp] + + (' ' if leading_whitespace else '')) + to_encode = leading_whitespace + to_encode elif to_encode[0] in WSP: # We're joining this to non-encoded text, so don't encode # the leading blank. @@ -2986,20 +3045,13 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, while to_encode: remaining_space = maxlen - len(lines[-1]) - text_space = remaining_space - chrome_len - len(leading_whitespace) + text_space = remaining_space - chrome_len if text_space <= 0: - lines.append(' ') + newline = _steal_trailing_WSP_if_exists(lines) + lines.append(newline or ' ') + new_last_ew = len(lines[-1]) continue - # If we are at the start of a continuation line, prepend whitespace - # (we only want to do this when the line starts with an encoded word - # but if we're folding in this helper function, then we know that we - # are going to be writing out an encoded word.) - if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace: - encoded_word = _ew.encode(leading_whitespace, charset=encode_as) - lines[-1] += encoded_word - leading_whitespace = '' - to_encode_word = to_encode[:text_space] encoded_word = _ew.encode(to_encode_word, charset=encode_as) excess = len(encoded_word) - remaining_space @@ -3011,7 +3063,6 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, excess = len(encoded_word) - remaining_space lines[-1] += encoded_word to_encode = to_encode[len(to_encode_word):] - leading_whitespace = '' if to_encode: lines.append(' ') |
