From 7b590cf574f715860ad2802eca08c7df60b62412 Mon Sep 17 00:00:00 2001 From: Mike Lang Date: Sat, 14 Sep 2024 13:27:15 +1000 Subject: [PATCH] chat-archiver: Some cleanups to the URL matching regex With thanks to Me-Me for review --- chat_archiver/chat_archiver/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/chat_archiver/chat_archiver/main.py b/chat_archiver/chat_archiver/main.py index 7d4d051..1aae410 100644 --- a/chat_archiver/chat_archiver/main.py +++ b/chat_archiver/chat_archiver/main.py @@ -354,7 +354,7 @@ URL_REGEX = re.compile(r""" # Previous char is not a letter. This prevents eg. "foohttp://example.com" # Also disallows / as the previous character, otherwise "file:///foo.bar/baz" # can match on the "foo.bar/baz" part. - (? https?:// )? # Hostname, which must contain a dot. Single-part hostnames like "localhost" are valid @@ -372,8 +372,8 @@ URL_REGEX = re.compile(r""" # like that even though it's encoded when actually sent as a URL. # Restricting this to letters prevents things like non-breaking spaces causing problems. # For the same reason we also allow {} and [] which seem to show up often in paths. - (?P / (\w | [!#$%&'()*+,./:;=?@_~{}-] | \[ | \] )* )? -""", re.ASCII | re.VERBOSE | re.IGNORECASE) + (?P / [\w!#$%&'()*+,./:;=?@_~{}\[\]-]* )? +""", re.VERBOSE | re.IGNORECASE) _IMAGE_LINKS_RUNNING = KeyedGroup()