Wget: Difference between revisions
(4 intermediate revisions by the same user not shown) | |||
Line 16: | Line 16: | ||
What are the options used here (from wget manpage): | What are the options used here (from wget manpage): | ||
-E<br /> | '''-E'''<br /> | ||
--html-extension | '''--html-extension''' | ||
If a file of type application/xhtml+xml or text/html is downloaded | If a file of type application/xhtml+xml or text/html is downloaded | ||
Line 29: | Line 29: | ||
article.cgi?25.html. | article.cgi?25.html. | ||
-r | '''-r''' | ||
Recursive: Meaning it downloads links as well. | Recursive: Meaning it downloads links as well. | ||
'''-k''' | |||
'''--convert-links''' | |||
After the download is complete, convert the links in the document | |||
to make them suitable for local viewing. This affects not only the | |||
visible hyperlinks, but any part of the document that links to | |||
external content, such as embedded images, links to style sheets, | |||
hyperlinks to non-HTML content, etc. | |||
'''-nH''' | |||
Turns off creating a top-level folder. | |||
=== Grabbing images === | === Grabbing images === | ||
Line 38: | Line 50: | ||
<source lang="bash"> | <source lang="bash"> | ||
wget -r -nd -np --follow-tags=img -A.jpg,.jpeg http://www.colourlovers.com | wget -e robots=off -r -nd -np --follow-tags=img -A.jpg,.jpeg http://www.colourlovers.com | ||
montage *.jpg ../public_html/montage.jpg | montage *.jpg ../public_html/montage.jpg | ||
</source> | </source> | ||
Line 71: | Line 83: | ||
thepage = f.read() | thepage = f.read() | ||
cal = ics.parseData(thepage) # or just "raw" data (string) | cal = ics.parseData(thepage) # or just "raw" data (string) | ||
</source> | |||
<source lang="python"> | |||
import urllib2, urlparse | |||
import html5lib, os | |||
def absolutize (href, base): | |||
if not href.lower().startswith("http://"): | |||
return urlparse.urljoin(base, href) | |||
return href | |||
def openURL (url, data): | |||
""" | |||
returns (page, actualurl) | |||
sets user_agent and resolves possible redirection | |||
realurl maybe different than url in the case of a redirect | |||
""" | |||
request = urllib2.Request(url) | |||
user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14" | |||
request.add_header("User-Agent", user_agent) | |||
pagefile=urllib2.urlopen(request, data) | |||
realurl = pagefile.geturl() | |||
return (pagefile, realurl) | |||
def downloadURL (url, foldername): | |||
""" | |||
returns (page, actualurl) | |||
sets user_agent and resolves possible redirection | |||
realurl maybe different than url in the case of a redirect | |||
""" | |||
request = urllib2.Request(url) | |||
user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14" | |||
request.add_header("User-Agent", user_agent) | |||
pagefile=urllib2.urlopen(request) | |||
realurl = pagefile.geturl() | |||
data = pagefile.read() | |||
urlpath = urlparse.urlparse(url)[2] | |||
(path, filename) = os.path.split(urlpath) | |||
filename = os.path.join(foldername, filename) | |||
out = open(filename, "wb") | |||
out.write(data) | |||
url = "http://www.ah.nl/previouslybought/PreviouslyBought.do" | |||
# f, url2 = openURL(url, "cardNumber=2620480991698") | |||
for line in open("postdata.txt"): | |||
data = line.strip() | |||
f, url2 = openURL(url, "fooo"+data+"&alkfjlaskdjf") | |||
parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom")) | |||
tree = parser.parse(f) | |||
f.close() | |||
tree.normalize() | |||
for node in tree.getElementsByTagName("img"): | |||
src = node.getAttribute("src") | |||
if src: | |||
src = absolutize(src, url2) | |||
print src | |||
downloadURL(src, "output") | |||
</source> | </source> |
Latest revision as of 16:42, 17 November 2009
Program to download files from the Web. Includes powerful "recursive" features that allow easily downloading entire portions of sites including linked images / other resources.
Examples
"Mirroring" a site
wget -Erk -nH 'http://www.yoursite.com/yourdirectory/yourfile.php'
Source: http://www.dreamincode.net/forums/showtopic8317.htm
What are the options used here (from wget manpage):
-E
--html-extension
If a file of type application/xhtml+xml or text/html is downloaded and the URL does not end with the regexp \.[Hh][Tt][Mm][Ll]?, this option will cause the suffix .html to be appended to the local filename. This is useful, for instance, when you’re mirroring a remote site that uses .asp pages, but you want the mirrored pages to be viewable on your stock Apache server. Another good use for this is when you’re downloading CGI-generated materials. A URL like http://site.com/article.cgi?25 will be saved as article.cgi?25.html.
-r
Recursive: Meaning it downloads links as well.
-k --convert-links
After the download is complete, convert the links in the document to make them suitable for local viewing. This affects not only the visible hyperlinks, but any part of the document that links to external content, such as embedded images, links to style sheets, hyperlinks to non-HTML content, etc.
-nH (--no-host-directories): disables generation of the host-prefixed top-level directory (e.g. www.yoursite.com/), so files are saved directly under the current directory.
Grabbing images
This simple two-line script uses wget to collect all the jpeg's from a website, then uses ImageMagick's montage tool to combine them in a single image.
wget -e robots=off -r -nd -np --follow-tags=img -A.jpg,.jpeg http://www.colourlovers.com
montage *.jpg ../public_html/montage.jpg
Doing the same thing in Python
import urllib2
def wget (url):
    """Open *url* while presenting a browser-like User-Agent header.

    Returns a tuple ``(pagefile, realurl)``: the open response object
    and the address actually served — ``realurl`` may differ from
    *url* when the server issued a redirect.
    """
    req = urllib2.Request(url)
    req.add_header(
        "User-Agent",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) "
        "Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14",
    )
    response = urllib2.urlopen(req)
    return response, response.geturl()
if __name__ == "__main__":
    # Example usage: fetch an iCalendar file and feed it to a parser.
    # NOTE(review): `ics` is never imported in this file — confirm which
    # module provides parseData() before running this example.
    theurl = 'http://pzwart2.wdka.hro.nl/ical/schedule.ics'
    (f, theurl) = wget(theurl)
    # print thepage
    # Option 1: hand the parser the open file object directly.
    cal = ics.parseData(f) # accepts file objects?
    # Option 2: read the whole response into a string first.
    thepage = f.read()
    cal = ics.parseData(thepage) # or just "raw" data (string)
import urllib2, urlparse
import html5lib, os
def absolutize (href, base):
    """Return *href* as an absolute URL.

    An href that already starts with ``http://`` (any letter case) is
    returned untouched; anything else is resolved relative to *base*.
    """
    if href.lower().startswith("http://"):
        return href
    return urlparse.urljoin(base, href)
def openURL (url, data):
    """POST *data* to *url* using a browser-like User-Agent header.

    Returns ``(pagefile, realurl)``: the open response object and the
    URL actually served, which may differ from *url* after a redirect.
    """
    req = urllib2.Request(url)
    req.add_header(
        "User-Agent",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) "
        "Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14",
    )
    # Passing a body to urlopen makes this an HTTP POST.
    reply = urllib2.urlopen(req, data)
    return reply, reply.geturl()
def downloadURL (url, foldername):
    """Download *url* and save it into *foldername*.

    The local filename is the last component of the URL's path, and the
    content is written in binary mode.  Returns nothing.  (The previous
    docstring was copy-pasted from openURL and described a return value
    this function never had.)
    """
    request = urllib2.Request(url)
    # Same browser-like User-Agent the other helpers in this file send.
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request.add_header("User-Agent", user_agent)
    pagefile = urllib2.urlopen(request)
    try:
        data = pagefile.read()
    finally:
        # Close the HTTP response even if read() fails (was leaked before).
        pagefile.close()
    # Name the local file after the last component of the URL path.
    urlpath = urlparse.urlparse(url)[2]
    filename = os.path.join(foldername, os.path.basename(urlpath))
    out = open(filename, "wb")
    try:
        out.write(data)
    finally:
        # Ensure the file handle is released (was leaked before).
        out.close()
# Scrape the "previously bought" page of the ah.nl web shop: for each
# line of POST data in postdata.txt, fetch the page and download every
# image it references into the "output" folder.
url = "http://www.ah.nl/previouslybought/PreviouslyBought.do"
# f, url2 = openURL(url, "cardNumber=2620480991698")
for line in open("postdata.txt"):
    data = line.strip()
    # NOTE(review): the POST body is built from placeholder text
    # ("fooo", "&alkfjlaskdjf") — presumably stand-ins for the real
    # form field names; verify against the actual form before use.
    f, url2 = openURL(url, "fooo"+data+"&alkfjlaskdjf")
    # Parse the response into a DOM tree (html5lib is third-party).
    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
    tree = parser.parse(f)
    f.close()
    tree.normalize()
    # Download each <img>, resolving relative src values against the
    # (possibly redirected) page URL returned by openURL.
    for node in tree.getElementsByTagName("img"):
        src = node.getAttribute("src")
        if src:
            src = absolutize(src, url2)
            print src
            downloadURL(src, "output")