Wget: Difference between revisions
No edit summary |
No edit summary |
||
Line 11: | Line 11: | ||
wget -r -nd -np --follow-tags=img -A.jpg,.jpeg http://www.colourlovers.com | wget -r -nd -np --follow-tags=img -A.jpg,.jpeg http://www.colourlovers.com | ||
montage *.jpg ../public_html/montage.jpg | montage *.jpg ../public_html/montage.jpg | ||
</source> | |||
== Doing the same thing with just Python == | |||
<source lang="python"> | |||
import urllib2 | |||
def wget (url): | |||
""" | |||
returns (page, actualurl) | |||
sets user_agent and resolves possible redirection | |||
realurl maybe different than url in the case of a redirect | |||
""" | |||
request = urllib2.Request(url) | |||
user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14" | |||
request.add_header("User-Agent", user_agent) | |||
pagefile=urllib2.urlopen(request) | |||
realurl = pagefile.geturl() | |||
return (pagefile, realurl) | |||
if __name__ == "__main__": | |||
# Example to use... | |||
theurl = 'http://pzwart2.wdka.hro.nl/ical/schedule.ics' | |||
(f, theurl) = wget(theurl) | |||
# print thepage | |||
# 1. | |||
cal = ics.parseData(f) # accepts file objects? | |||
# or 2. | |||
thepage = f.read() | |||
cal = ics.parseData(thepage) # or just "raw" data (string) | |||
</source> | </source> |
Revision as of 16:01, 4 June 2009
Wget is a program for downloading files from the Web. It includes powerful "recursive" features that make it easy to download entire portions of sites, including linked images and other resources.
Examples
This simple two-line script uses wget to collect all the JPEGs from a website, then uses ImageMagick's montage tool to combine them into a single image.
# Recursively (-r) crawl the site, saving files flat in the current directory
# (-nd), never ascending to the parent directory (-np), following only <img>
# tags (--follow-tags=img) and keeping only .jpg/.jpeg files (-A).
wget -r -nd -np --follow-tags=img -A.jpg,.jpeg http://www.colourlovers.com
# Combine the downloaded images into one montage image.
# NOTE(review): only *.jpg is matched here, so files saved as .jpeg are
# presumably left out of the montage -- confirm whether that is intended.
montage *.jpg ../public_html/montage.jpg
Doing the same thing with just Python
import urllib2
def wget(url, user_agent=None):
    """Open *url* with a browser-like User-Agent and return the response.

    Args:
        url: The URL to fetch.
        user_agent: Value for the ``User-Agent`` request header. When
            omitted (``None``), a Firefox 2 on Ubuntu identification
            string is sent.

    Returns:
        A ``(pagefile, realurl)`` tuple. ``pagefile`` is the file-like
        response object returned by ``urllib2.urlopen``; it is returned
        open and is not closed by this function. ``realurl`` is the URL
        reported by ``pagefile.geturl()``, which may be different from
        *url* in the case of a redirect.
    """
    if user_agent is None:
        user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request = urllib2.Request(url)
    request.add_header("User-Agent", user_agent)
    pagefile = urllib2.urlopen(request)
    realurl = pagefile.geturl()
    return (pagefile, realurl)
if __name__ == "__main__":
    # Example usage: download an iCalendar file and hand it to a parser.
    # NOTE(review): `ics` is not imported anywhere in this snippet; it is
    # presumably an iCalendar parsing module that must be imported before
    # this example can run -- confirm which module is meant.
    theurl = 'http://pzwart2.wdka.hro.nl/ical/schedule.ics'
    (f, theurl) = wget(theurl)  # theurl now holds the post-redirect URL

    # Read the response exactly once: the stream cannot be re-read after it
    # has been consumed, so the two parsing alternatives must not both run.
    thepage = f.read()
    f.close()

    # Parse the "raw" data (string).
    # Alternative (unverified): pass the still-open file object `f` directly
    # instead of `thepage`, if ics.parseData accepts file objects.
    cal = ics.parseData(thepage)