Wget: Difference between revisions

Latest revision as of 17:42, 17 November 2009

Program to download files from the Web. Includes powerful "recursive" features that allow easily downloading entire portions of sites including linked images / other resources.

Examples

"Mirroring" a site

wget -Erk -nH 'http://www.yoursite.com/yourdirectory/yourfile.php'

Source: http://www.dreamincode.net/forums/showtopic8317.htm

What are the options used here (from wget manpage):

-E
--html-extension

If a file of type application/xhtml+xml or text/html is downloaded and the URL does not end with the regexp \.[Hh][Tt][Mm][Ll]?, this option will cause the suffix .html to be appended to the local filename. This is useful, for instance, when you’re mirroring a remote site that uses .asp pages, but you want the mirrored pages to be viewable on your stock Apache server. Another good use for this is when you’re downloading CGI-generated materials. A URL like http://site.com/article.cgi?25 will be saved as article.cgi?25.html.

-r

Recursive: wget follows the links in each downloaded page and downloads the linked pages as well.

-k --convert-links

After the download is complete, convert the links in the document to make them suitable for local viewing. This affects not only the visible hyperlinks, but any part of the document that links to external content, such as embedded images, links to style sheets, hyperlinks to non-HTML content, etc.

-nH Turns off creating a top-level folder.

Grabbing images

This simple two-line script uses wget to collect all the JPEG images from a website, then uses ImageMagick's montage tool to combine them into a single image.

wget -e robots=off -r -nd -np --follow-tags=img -A.jpg,.jpeg http://www.colourlovers.com
montage *.jpg ../public_html/montage.jpg

Doing the same thing in Python

import urllib2

def wget (url):
	"""
	Open url while identifying as a desktop Firefox browser.

	Returns a tuple (response, final_url). response is the file-like
	object produced by urllib2.urlopen; final_url is what the response
	reports through geturl(), which can differ from url when the
	server redirected the request.
	"""
	headers = {
		"User-Agent": "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14",
	}
	response = urllib2.urlopen(urllib2.Request(url, None, headers))
	return (response, response.geturl())

if __name__ == "__main__":
# Usage example: fetch a calendar file with wget() and hand it to a parser.
# NOTE(review): `ics` is not imported anywhere in this snippet; presumably an
# iCalendar parsing module exposing parseData() -- confirm and add the import.
	theurl = 'http://pzwart2.wdka.hro.nl/ical/schedule.ics'
	(f, theurl) = wget(theurl)
	# theurl now holds the URL reported by the response (after any redirect).

	# Alternative 1: pass the file-like response object straight to the parser.
	cal = ics.parseData(f) # accepts file objects?

	# Alternative 2: read the whole body first and pass the raw data.
	# NOTE(review): the two alternatives look mutually exclusive -- if
	# alternative 1 already consumed f, this read() may return nothing.
	thepage = f.read()
	cal = ics.parseData(thepage) # or just "raw" data (string)

import urllib2, urlparse
import html5lib, os
 
def absolutize (href, base):
    """
    Return href as an absolute URL.

    An href that already begins with "http://" (compared
    case-insensitively) is handed back untouched; anything else is
    resolved against base with urlparse.urljoin.
    """
    already_absolute = href.lower().startswith("http://")
    if already_absolute:
        return href
    return urlparse.urljoin(base, href)

def openURL (url, data):
    """
    Open url with a browser-like User-Agent, sending data along.

    data is forwarded unchanged to urllib2.urlopen as the request's
    data argument (per urllib2, the POST body; None means a plain GET).

    Returns a tuple (response, final_url): the file-like response and
    the URL it reports via geturl(), which may differ from url after a
    redirect.
    """
    browser_id = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    req = urllib2.Request(url, headers={"User-Agent": browser_id})
    response = urllib2.urlopen(req, data)
    return (response, response.geturl())

def downloadURL (url, foldername):
    """
    Download url and save its body as a file inside foldername.

    The local file name is the last segment of the path component of
    url (query string and fragment are not part of it).  foldername is
    not created here, so it has to exist already.  The request is sent
    with a browser-like User-Agent header.

    Returns None.  Errors from urllib2.urlopen or from opening/writing
    the output file propagate to the caller.
    """
    request = urllib2.Request(url)
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request.add_header("User-Agent", user_agent)
    pagefile = urllib2.urlopen(request)
    try:
        data = pagefile.read()
    finally:
        # Release the connection even when the read fails part-way.
        pagefile.close()

    # Name the file after the requested url (not the post-redirect one).
    urlpath = urlparse.urlparse(url)[2]
    filename = os.path.join(foldername, os.path.basename(urlpath))
    # The with-block guarantees the data is flushed and the handle closed.
    with open(filename, "wb") as out:
        out.write(data)

url = "http://www.ah.nl/previouslybought/PreviouslyBought.do"
# f, url2 = openURL(url, "cardNumber=2620480991698")

for line in open("postdata.txt"):
    data = line.strip()
    f, url2 = openURL(url, "fooo"+data+"&alkfjlaskdjf")

parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
tree = parser.parse(f)
f.close()
tree.normalize()
for node in tree.getElementsByTagName("img"):
    src = node.getAttribute("src")
    if src:
        src = absolutize(src, url2)
        print src
        downloadURL(src, "output")

@@ Line 5: / Line 5: @@
 == Examples ==
+=== "Mirroring" a site ===
+<source lang="bash">
+wget -Erk -nH 'http://www.yoursite.com/yourdirectory/yourfile.php"
+</source>
+Source: http://www.dreamincode.net/forums/showtopic8317.htm
+What are the options used here (from wget manpage):
+'''-E'''<br />
+'''--html-extension'''
+If a file of type application/xhtml+xml or text/html is downloaded
+and the URL does not end with the regexp \.[Hh][Tt][Mm][Ll]?, this
+option will cause the suffix .html to be appended to the local
+filename.  This is useful, for instance, when you’re mirroring a
+remote site that uses .asp pages, but you want the mirrored pages
+to be viewable on your stock Apache server.  Another good use for
+this is when you’re downloading CGI-generated materials.  A URL
+like http://site.com/article.cgi?25 will be saved as
+article.cgi?25.html.
+'''-r'''
+Recursive: Meaning it downloads links as well.
+'''-k'''
+'''--convert-links'''
+After the download is complete, convert the links in the document
+to make them suitable for local viewing.  This affects not only the
+visible hyperlinks, but any part of the document that links to
+external content, such as embedded images, links to style sheets,
+hyperlinks to non-HTML content, etc.
+'''-nH'''
+Turns off creating a top-level folder.
+=== Grabbing images ===
 This simple two-line script uses wget to collect all the jpeg's from a website, then uses [[ImageMagick]]'s montage tool to combine them in a single image.
 <source lang="bash">
-wget -r -nd -np --follow-tags=img -A.jpg,.jpeg http://www.colourlovers.com
+wget -e robots=off -r -nd -np --follow-tags=img -A.jpg,.jpeg http://www.colourlovers.com
 montage *.jpg ../public_html/montage.jpg
 </source>
-== Doing the same thing with just Python ==
+== Doing the same thing in Python ==
 <source lang="python">
 import urllib2
@@ Line 42: / Line 83: @@
 	thepage = f.read()
 	cal = ics.parseData(thepage) # or just "raw" data (string)
+</source>
+<source lang="python">
+import urllib2, urlparse
+import html5lib, os
+def absolutize (href, base):
+    if not href.lower().startswith("http://"):
+        return urlparse.urljoin(base, href)
+    return href
+def openURL (url, data):
+    """
+    returns (page, actualurl)
+    sets user_agent and resolves possible redirection
+    realurl maybe different than url in the case of a redirect
+    """
+    request = urllib2.Request(url)
+    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
+    request.add_header("User-Agent", user_agent)
+    pagefile=urllib2.urlopen(request, data)
+    realurl = pagefile.geturl()
+    return (pagefile, realurl)
+def downloadURL (url, foldername):
+    """
+    returns (page, actualurl)
+    sets user_agent and resolves possible redirection
+    realurl maybe different than url in the case of a redirect
+    """
+    request = urllib2.Request(url)
+    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
+    request.add_header("User-Agent", user_agent)
+    pagefile=urllib2.urlopen(request)
+    realurl = pagefile.geturl()
+    data = pagefile.read()
+    urlpath = urlparse.urlparse(url)[2]
+    (path, filename) = os.path.split(urlpath)
+    filename = os.path.join(foldername, filename)
+    out = open(filename, "wb")
+    out.write(data)
+url = "http://www.ah.nl/previouslybought/PreviouslyBought.do"
+# f, url2 = openURL(url, "cardNumber=2620480991698")
+for line in open("postdata.txt"):
+    data = line.strip()
+    f, url2 = openURL(url, "fooo"+data+"&alkfjlaskdjf")
+parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
+tree = parser.parse(f)
+f.close()
+tree.normalize()
+for node in tree.getElementsByTagName("img"):
+    src = node.getAttribute("src")
+    if src:
+        src = absolutize(src, url2)
+        print src
+        downloadURL(src, "output")
 </source>