"""Recursively search a directory tree for URLs, `grep -r` style.

Usage: python <script> <directory>

Prints one `<file>:<line>:<match>` line for every line that contains a URL
(only the first URL on each line is reported).
"""
import os
import re
import sys

# Crazy URL regexp from Gruber
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
r = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))')


def find_urls(root):
    """Yield `(filename, line_number, url)` for each line containing a URL.

    Walks `root` recursively with `os.walk`. Line numbers are 1-based and
    only the first URL found on a line is reported. Files that cannot be
    opened or decoded are reported on stderr and skipped, so one bad file
    does not abort the whole search.
    """
    for parent, _dnames, fnames in os.walk(root):
        for fname in fnames:
            filename = os.path.join(parent, fname)
            # Skip broken symlinks and other non-regular entries.
            if not os.path.isfile(filename):
                continue
            try:
                with open(filename) as f:
                    for lineno, line in enumerate(f, 1):
                        match = r.search(line)
                        if match:
                            yield filename, lineno, match.group(0)
            except (IOError, OSError, UnicodeDecodeError) as exc:
                # Unreadable or binary file: note it and keep going.
                sys.stderr.write('%s: skipped (%s)\n' % (filename, exc))


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    `argv` defaults to `sys.argv`. Returns 2 when the directory argument is
    missing, 0 otherwise.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        prog = argv[0] if argv else 'script'
        sys.stderr.write('usage: %s <directory>\n' % prog)
        return 2
    for filename, lineno, url in find_urls(argv[1]):
        # <file>:<line>:<match>
        # Single parenthesized argument: valid on both Python 2 and 3.
        print('%s:%s:%s' % (filename, lineno, url))
    return 0


if __name__ == '__main__':
    sys.exit(main())
# Source: Recursive grep-like search for extracting URLs from a bunch of files