Recursive grep-like search for extracting URLs from a bunch of files

import os
import re
import sys

# Crazy URL regexp from Gruber
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
r = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')

# grep -r
for parent, dnames, fnames in os.walk(sys.argv[1]):
    for fname in fnames:
        filename = os.path.join(parent, fname)
        if os.path.isfile(filename):
            # errors='ignore' keeps undecodable bytes in binary files from aborting the walk
            with open(filename, errors='ignore') as f:
                for c, line in enumerate(f, 1):
                    match = r.search(line)
                    if match:
                        # <file>:<line>:<match>
                        print('%s:%s:%s' % (filename, c, match.group(0)))
                        # <match>
                        #print(match.group(0))
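
The script takes the directory to walk as its only command-line argument and prints grep -rn style <file>:<line>:<match> records; note that r.search reports only the first URL on each line. As a quick, self-contained illustration (the sample sentence and the example.com/example.org URLs are placeholders of mine, not part of the original script), the same Gruber pattern can be exercised on a single string, with re.finditer as the variant that pulls out every URL on a line:

import re

# Same pattern as above
url = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')

sample = 'See http://example.com/one and http://example.org/two for details.'

# search() returns only the first URL on the line (this is what the script prints)
print(url.search(sample).group(0))

# finditer() yields every non-overlapping URL on the line
for m in url.finditer(sample):
    print(m.group(0))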
