##
## addhrefs.py
## by Peter Bengtsson, 2004-2005, mail@peterbe.com
##
## License: ZPL (http://www.zope.org/Resources/ZPL)
##
__doc__='''A little function that puts HTML links into text.'''
__version__='0.9.3'

__changes__ = '''
0.9.3   Fixed a bug when text contains "m." but wasnt followed but a a-z.

0.9.2   Added supports for URLs starting with m., mobile. and www2.
        Added 1 new unit test

0.9.1   Fixed broken link parsing containing {curly brackets}
        Added 15 new unit tests

0.9     Better support for strings already containing
'''
# NOTE(review): the copy of this file that was reviewed had been run through
# an HTML-tag stripper.  The tail of the 0.9 changelog entry, the import
# lines, the head of _end_dropouts, the anchor templates in _makeLink /
# _makeMailLink and the closing-anchor literal in addhrefs() were lost and
# have been reconstructed below.  Every reconstructed spot carries its own
# NOTE(review); confirm them against a pristine copy.

import re
import sys

# Star-imports get the link function and the pattern tables only; the
# imported modules and the TODO stub test() are deliberately left out.
__all__ = ['addhrefs', 'ok', 'ok_middle_name_starts']

# Characters trimmed from the end / start of a candidate before linking.
# NOTE(review): only the tail '.;:,"' of _end_dropouts survived; the leading
# ')>' is reconstructed.  ')' has to be here because the URL pattern swallows
# a closing parenthesis and _rejectURL refuses any URL still containing one.
_end_dropouts = list(')>.;:,"')
_start_dropouts = list('(<')


def _massageURL(url):
    """Return `url` without trailing punctuation or one leading bracket.

    All trailing characters found in _end_dropouts are removed, then at most
    one leading character found in _start_dropouts.
    """
    # The emptiness guards keep a candidate made only of drop-out
    # characters from raising IndexError.
    while url and url[-1] in _end_dropouts:
        url = url[:-1]
    if url and url[0] in _start_dropouts:
        url = url[1:]
    return url


def _improveURL(url):
    """Return `url` prefixed with 'http://' when it starts with a bare host
    prefix, i.e. an entry of ok_middle_name_starts that ends in a dot
    ('www.', 'mobile.', 'm.', 'www2.').  Other values come back unchanged.
    """
    for each in ok_middle_name_starts:
        if each.endswith('.') and url.startswith(each):
            return 'http://'+url
    return url


def _makeLink(url):
    """Default URL link function: wrap `url` in an HTML anchor."""
    # NOTE(review): anchor template reconstructed (the stripped original
    # read '%s' % (a, b), which raises TypeError).
    # NOTE(review): `url` is inserted unescaped and may contain '"'; callers
    # handling untrusted text should pass an escaping urllinkfunction.
    return '<a href="%s">%s</a>'%(_improveURL(url), url)


def _makeMailLink(url):
    """Default email link function: wrap `url` in a mailto: anchor."""
    # NOTE(review): template reconstructed; the 'mailto:' prefix is assumed.
    return '<a href="mailto:%s">%s</a>'%(_improveURL(url), url)


def _rejectEmail(email, start):
    """Return True if `email` still contains a ':' once an optional leading
    'mailto:' has been removed.  `start` is accepted but not used.
    """
    if email.startswith("mailto:"):
        email = email[7:]
    return email.find(':') > -1


_bad_in_url = list('!()<>')
_dont_start_url = list('@')


def _rejectURL(url, start):
    """ return true if the URL can't be a URL

    Rejected are: the bare word 'https', anything containing one of
    _bad_in_url, anything containing '@' without a ':' in front of it
    (only the user:password@host form is accepted), and any match whose
    preceding character `start` is in _dont_start_url.
    """
    if url.lower()=='https':
        return True
    for each in _bad_in_url:
        if url.find(each) > -1:
            return True
    whereat = url.find('@')
    if whereat > -1:
        # Drop the scheme so that its own ':' is not taken for the
        # user:password separator.  `whereat` intentionally keeps the
        # position measured before the scheme was removed.
        for each in "http:// ftp:// https://".split():
            url = url.replace(each, '')
        if not -1 < url.find(':') < whereat:
            return True
    if start in _dont_start_url:
        return True
    return False


def _make_regexp(regexp):
    """Compile `regexp` after widening its whitespace shorthands.

    A literal backslash-s in the pattern text is replaced by a class that
    also counts brackets and braces as separators; a literal backslash-S is
    replaced by a class excluding whitespace, braces and angle brackets.
    Parentheses stay legal inside a match; _massageURL and _rejectURL deal
    with them afterwards.
    """
    _whitespace = r"[\s\({}<>\)]"
    _not_whitespace = r"[^\s{}<>]"
    # Order matters: the lower-case shorthand must be expanded first so the
    # text inserted by the second replacement is left alone.
    regexp = regexp.replace(r"\s", _whitespace)
    regexp = regexp.replace(r"\S", _not_whitespace)
    return re.compile(regexp)


ok_middle_name_starts = ('ftp','http','www.','mobile.','m.','www2.')

# Building blocks of the match patterns: what may precede a candidate, what
# a URL may look like, and what may follow it.
ok = {'start': ('^', r'\(', '{', '>', '<', '@', r'\s', ''),
      'middle': (r'ftp\S+', r'http\S+', r'www\.\w\S+',
                 # 'www2.' is listed in ok_middle_name_starts and in the
                 # 0.9.2 changelog entry but had no pattern here, so
                 # 'www2.ibm.com' was only picked up as 'm.com'.
                 r'www2\.\w\S+',
                 r'mobile\.\w\S+', r'm\.\w\S+',),
      'end': (r'\)', '}', '>', r'\s', '$'),
      }

_or = lambda some_list: "|".join(some_list)

# Both patterns yield four groups: (whole, start, candidate, end).
_url_regex = _make_regexp('((%s)(%s)(%s))'%(_or(ok['start']),
                                             _or(ok['middle']),
                                             _or(ok['end'])))

_mailto_regex = _make_regexp(r'((%s)(\S+@\S+\.\S+)(%s))' % (_or(ok['start']),
                                                            _or(ok['end'])))


def addhrefs(text, return_everything=0,
             emaillinkfunction=_makeMailLink,
             urllinkfunction=_makeLink):
    """Return `text` with URLs and email addresses turned into links.

    text              -- the string to scan.
    return_everything -- when true, return the tuple
                         (text, info_urls, info_emails) where both lists
                         hold (matched string, generated link) pairs.
    emaillinkfunction -- callable(address) -> replacement markup.
    urllinkfunction   -- callable(url) -> replacement markup.

    A link function that is not callable is replaced by the default one;
    unless it is None a warning line is written to sys.stderr.
    """
    if not callable(emaillinkfunction):
        if emaillinkfunction is not None:
            _msg = "%r is not callable email link function"
            sys.stderr.write(_msg % (emaillinkfunction,) + "\n")
        emaillinkfunction = _makeMailLink

    if not callable(urllinkfunction):
        if urllinkfunction is not None:
            _msg = "%r is not callable URL link function"
            sys.stderr.write(_msg % (urllinkfunction,) + "\n")
        urllinkfunction = _makeLink

    info_emails = []
    info_urls = []

    for whole, start, url, _end in _url_regex.findall(text):
        if whole.endswith('">'):
            # reject it because it looks like it's taken out of a tag
            continue
        if whole.endswith('<'):
            # The next thing is a tag; if that tag closes an anchor the
            # URL is already the text of a link, so chicken out.
            # NOTE(review): the '</a>' literal is reconstructed.  The
            # original compared a 5-character slice with it, which could
            # only succeed at the very end of the text.
            pos = text.find(whole)
            if pos > -1 and text.startswith('</a>', pos + len(whole) - 1):
                continue

        url = _massageURL(url)
        if _rejectURL(url, start):
            continue

        link = urllinkfunction(url)
        if return_everything:
            info_urls.append((url, link))
        better = whole.replace(url, link)
        text = text.replace(whole, better, 1)

    # The email pass runs on the text as modified by the URL pass.
    for whole, start, url, _end in _mailto_regex.findall(text):
        url = _massageURL(url)
        if _rejectEmail(url, start):
            continue
        if url.find(':') > -1:
            # Only a 'mailto:' prefixed address gets here; it is already a
            # complete URL, so it goes through the URL link function.
            link = urllinkfunction(url)
            if return_everything:
                info_urls.append((url, link))
        else:
            link = emaillinkfunction(url)
            info_emails.append((url, link))
        better = whole.replace(url, link)
        text = text.replace(whole, better)

    if return_everything:
        return text, info_urls, info_emails
    return text


def test():
    """Placeholder: always raises; the samples are kept for migration."""
    # String exceptions (raise "TODO", "...") are not raisable on Python
    # 2.6+ and are a syntax error on Python 3.
    raise NotImplementedError(
        "TODO: Move these slowly into testAddhrefs.py")

    # Everything below is unreachable and kept only as sample input.
    # NOTE(review): several samples appear to have lost HTML markup to the
    # same tag stripping; they are reproduced as found.
    t="this some text http://www.peterbe.com/ with links www.peterbe.com in it"
    t='''this some text http://www.peterbe.com/ with links www.peterbe.com in it Example'''
    t2='this some text http://www.peterbe.com/ '\
       'with links www.peterbe.com in it '\
       'Example'
    print(addhrefs(t))

    t3='''this some text http://www.peterbe.com/ with links www.peterbe.com in it Example www,peterbe.com and www.peterbe.com '''
    t4='''https://www.imdb.com (www.peterbe.com/?a=e) asd tra la www.google.com'''
    t = 'word (www.peterbe.com) word'
    t = 'word and so on'
    t = 'Go to: http://www.peterbe.com. There youll find'
    t = 'Go to: http://www.peterbe.com:'
    t = '''https://www.imdb.com.'''
    t = 'Hello mail@peterbe.com to you'
    t = 'Hello and to you'
    #t = open('sample-htmlfree.txt').read()
    t = 'Link1 link www.2.com'
    t = "Link1 link www.2.com"
    t = '''1. http://www.peterbe.com 2. www.peterbe.com 3. 4. mail@foobar.com 5. "Name "'''
    t = 'xxx mail@peterbe.com peter@grenna.net'
    t += ' xxx www.peterbe.com www.google.com xxx'
    t = 'mail@peterbe.com 123@a.com or www2.ibm.com or www.ibm.com?asda=ewr&gr:int=34.'
    t = 'peter@grenna.net 123@a.com ftp://ftp.uk.linux.org/'
    t = 'http://david:otton@www.something.com david:otton@www.something.com'
    t = ''' xxx abc '''
    t='''www.msn.co.uk http://msn.co.uk http://www.msn.co.uk ftp:/google.com '''
    t = 'At http://localhost/ I have apache and at http://localhost:8080 '\
        ' I have Zope David used http://enchanter or http://enchanter/'
    t = 'See http://www.something.com/page?this=that#001'
    t = 'Bla bla https bla bla and http bla'
    t = '\n\nmail@peterbe.com\n\n\n\n\n\nwww.something.com\n\n'
    t = '\n\nhttp://something.com\n\n\n\n\n\nmail@peterbe.com\n\n'
    t = '\n\nhttp://example.com\n\n\n\n\n\nkilobug@freesurf.fr\n\n'
    t += '\n\nhttp://www.dil(bert.com'


if __name__=='__main__':
    test()