chat-archiver: Some cleanups to the URL matching regex

With thanks to Me-Me for review
Mike Lang 2 months ago
parent 902a78a520
commit 32f573297d

@ -354,7 +354,7 @@ URL_REGEX = re.compile(r"""
# Previous char is not a word character (letter, digit or underscore). This prevents e.g. "foohttp://example.com" from matching.
# Also disallows / as the previous character, otherwise "file:///foo.bar/baz"
# can match on the "foo.bar/baz" part.
(?<! \w | / )
(?<! [\w/] )
# optional scheme, which must be http or https (we don't want other schemes)
(?P<scheme> https?:// )?
# Hostname, which must contain a dot. Single-part hostnames like "localhost" are valid
@ -372,8 +372,8 @@ URL_REGEX = re.compile(r"""
# like that even though it's encoded when actually sent as a URL.
# Restricting this to letters prevents things like non-breaking spaces causing problems.
# For the same reason we also allow {} and [] which seem to show up often in paths.
(?P<path> / (\w | [!#$%&'()*+,./:;=?@_~{}-] | \[ | \] )* )?
""", re.ASCII | re.VERBOSE | re.IGNORECASE)
(?P<path> / [\w!#$%&'()*+,./:;=?@_~{}\[\]-]* )?
""", re.VERBOSE | re.IGNORECASE)
_IMAGE_LINKS_RUNNING = KeyedGroup()

Loading…
Cancel
Save