Revision as of 19:03, 12 January 2011

Get the URLs of all images on a page

import urllib2, urlparse, html5lib, lxml
from lxml.cssselect import CSSSelector

# Fetch one page and print the absolute URL of every <img> element that
# carries a src attribute.
request = urllib2.Request("http://www.volkskrant.nl/")
# Send a browser-like User-Agent header along with the request.
request.add_header("User-Agent", "Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.1.5) Gecko/20091109 Ubuntu/9.10 (karmic) Firefox/3.5.5")

# html5lib parser that builds an lxml tree. namespaceHTMLElements=False is
# presumably what lets the plain 'img[src]' selector below match -- confirm
# against the html5lib documentation.
parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)

f = urllib2.urlopen(request)
try:
    # Also available on the response: f.info() for the headers.
    # geturl() is the URL actually retrieved (per the urllib2 docs it can
    # differ from the requested one after a redirect); relative src values
    # are resolved against it.
    base_url = f.geturl()
    page = parser.parse(f)
finally:
    # The response object is not closed automatically; release the
    # connection even if parsing raises.
    f.close()

select_images = CSSSelector('img[src]')
for elt in select_images(page):
    href = urlparse.urljoin(base_url, elt.attrib['src'])
    # Parenthesised single argument: same output as the Python 2 print statement.
    print(href)

Revision as of 19:01, 12 January 2011 (view source) Michael Murtaugh (talk \| contribs) (Created page with "= Get the URL of all images on a page = <source lang="python"> import urllib2, urlparse, html5lib, lxml from lxml.cssselect import CSSSelector request = urllib2.Request("http:/...")		Revision as of 19:03, 12 January 2011 (view source) Michael Murtaugh (talk \| contribs) No edit summary Newer edit →
Line 19:		Line 19:

	</source>		</source>

			[[Category:Cookbook]]