This is absolutize.py, version 0.1. Use it to convert relative URLs to absolute URLs in the source of a Web page. The regular expression has two parts because HTML attribute values can be enclosed in either double quotes or single quotes.

This is version 0.1.

#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#    Copyright 2009 Henry S. Vieira

import sys
import codecs
import re
import urlparse

def absolutize(base_url, html):
    """Return *html* with relative ``src``/``href`` URLs made absolute.

    Every ``src="..."`` or ``href="..."`` attribute (double- or
    single-quoted, with optional whitespace around ``=``) whose URL has no
    network location is rewritten by joining the URL with *base_url* using
    ``urlparse.urljoin``. Attributes whose URL already names a host are left
    untouched, as are attribute values containing whitespace.

    Args:
        base_url: URL that relative references are resolved against.
        html: Source text of the page.

    Returns:
        The page source with the matching attribute URLs replaced.
    """
    # Two alternatives, one per quoting style. The character classes stop
    # at the closing quote so that adjacent attributes written without
    # separating whitespace (href="a"class="b") are not swallowed into
    # the URL.
    expression = re.compile(
        r'(src|href)\s*=\s*"([^"\s]+)"'
        r"|(src|href)\s*=\s*'([^'\s]+)'")

    def rewrite(match):
        # Group 2 holds a double-quoted URL, group 4 a single-quoted one.
        quoted = 2 if match.group(2) is not None else 4
        url = match.group(quoted)

        if urlparse.urlparse(url).netloc != '':
            # Already points at a host: keep the attribute exactly as is.
            return match.group(0)

        # Splice the absolute URL into the exact span the URL occupied.
        # A textual replace would also hit the attribute name whenever the
        # URL happens to be "src" or "href".
        attribute = match.group(0)
        start = match.start(quoted) - match.start(0)
        end = match.end(quoted) - match.start(0)
        return (attribute[:start]
                + urlparse.urljoin(base_url, url)
                + attribute[end:])

    return expression.sub(rewrite, html)

def main():
    """Absolutize the URLs of an HTML file named on the command line.

    Usage: absolutize.py BASE_URL HTML_FILE

    Reads HTML_FILE as Latin-1, passes its contents and BASE_URL to
    ``absolutize``, and writes the result as Latin-1 to a sibling file whose
    name has ``-absolute`` inserted before the extension
    (``page.html`` -> ``page-absolute.html``).
    """
    # Local import: only this entry point needs os.path.
    import os.path

    if len(sys.argv) < 3:
        sys.stderr.write('usage: %s BASE_URL HTML_FILE\n' % sys.argv[0])
        sys.exit(2)

    base_url, source_path = sys.argv[1], sys.argv[2]

    # Split off only the final extension. Replacing every '.' would mangle
    # paths such as './dir/my.page.html', and would leave an extension-less
    # name unchanged so that the output overwrote the input.
    root, extension = os.path.splitext(source_path)
    target_path = root + '-absolute' + extension

    with codecs.open(source_path, 'r', 'latin-1') as source:
        html = absolutize(base_url, source.read())

    with codecs.open(target_path, 'w', 'latin-1') as target:
        target.write(html)

# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()

0 Responses to “absolutize.py”



  1. Leave a Comment

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s




Twitter Updates

December 2016
S M T W T F S
« Jan    
 123
45678910
11121314151617
18192021222324
25262728293031

%d bloggers like this: