chat-archiver: Some cleanups to the URL matching regex

With thanks to Me-Me for review
pull/408/head
Mike Lang 2 months ago committed by Mike Lang
parent 15f86551d4
commit 7b590cf574

@ -354,7 +354,7 @@ URL_REGEX = re.compile(r"""
# Previous char is not a letter. This prevents e.g. "foohttp://example.com" from matching. # Previous char is not a letter. This prevents e.g. "foohttp://example.com" from matching.
# Also disallows / as the previous character, otherwise "file:///foo.bar/baz" # Also disallows / as the previous character, otherwise "file:///foo.bar/baz"
# can match on the "foo.bar/baz" part. # can match on the "foo.bar/baz" part.
(?<! \w | / ) (?<! [\w/] )
# optional scheme, which must be http or https (we don't want other schemes) # optional scheme, which must be http or https (we don't want other schemes)
(?P<scheme> https?:// )? (?P<scheme> https?:// )?
# Hostname, which must contain a dot. Single-part hostnames like "localhost" are valid # Hostname, which must contain a dot. Single-part hostnames like "localhost" are valid
@ -372,8 +372,8 @@ URL_REGEX = re.compile(r"""
# like that even though it's encoded when actually sent as a URL. # like that even though it's encoded when actually sent as a URL.
# Restricting this to letters prevents things like non-breaking spaces causing problems. # Restricting this to letters prevents things like non-breaking spaces causing problems.
# For the same reason we also allow {} and [] which seem to show up often in paths. # For the same reason we also allow {} and [] which seem to show up often in paths.
(?P<path> / (\w | [!#$%&'()*+,./:;=?@_~{}-] | \[ | \] )* )? (?P<path> / [\w!#$%&'()*+,./:;=?@_~{}\[\]-]* )?
""", re.ASCII | re.VERBOSE | re.IGNORECASE) """, re.VERBOSE | re.IGNORECASE)
_IMAGE_LINKS_RUNNING = KeyedGroup() _IMAGE_LINKS_RUNNING = KeyedGroup()

Loading…
Cancel
Save