Wget: Difference between revisions

From XPUB & Lens-Based wiki
No edit summary
 
(8 intermediate revisions by the same user not shown)
Line 5: Line 5:


== Examples ==
== Examples ==
=== "Mirroring" a site ===
<source lang="bash">
wget -Erk -nH 'http://www.yoursite.com/yourdirectory/yourfile.php'
</source>
Source: http://www.dreamincode.net/forums/showtopic8317.htm
What are the options used here (from wget manpage):
'''-E'''<br />
'''--html-extension'''
If a file of type application/xhtml+xml or text/html is downloaded
and the URL does not end with the regexp \.[Hh][Tt][Mm][Ll]?, this
option will cause the suffix .html to be appended to the local
filename.  This is useful, for instance, when you’re mirroring a
remote site that uses .asp pages, but you want the mirrored pages
to be viewable on your stock Apache server.  Another good use for
this is when you’re downloading CGI-generated materials.  A URL
like http://site.com/article.cgi?25 will be saved as
article.cgi?25.html.
'''-r'''
Recursive: Meaning it downloads links as well.
'''-k'''
'''--convert-links'''
After the download is complete, convert the links in the document
to make them suitable for local viewing.  This affects not only the
visible hyperlinks, but any part of the document that links to
external content, such as embedded images, links to style sheets,
hyperlinks to non-HTML content, etc.
'''-nH'''
Turns off creating a top-level folder.
=== Grabbing images ===


This simple two-line script uses wget to collect all the JPEGs from a website, then uses [[ImageMagick]]'s montage tool to combine them into a single image.
This simple two-line script uses wget to collect all the JPEGs from a website, then uses [[ImageMagick]]'s montage tool to combine them into a single image.


<source lang="bash">
<source lang="bash">
wget -r -nd -np --follow-tags=img -A.jpg,.jpeg http://www.colourlovers.com
wget -e robots=off -r -nd -np --follow-tags=img -A.jpg,.jpeg http://www.colourlovers.com
montage *.jpg ../public_html/montage.jpg  
montage *.jpg ../public_html/montage.jpg  
</source>
== Doing the same thing in Python ==
<source lang="python">
import urllib2
def wget (url):
    """
    Open url with a browser-like User-Agent header.

    Returns (page, actualurl): page is the open file-like response from
    urllib2.urlopen (not yet read) and actualurl is the value of its
    geturl(), which may be different from url in the case of a redirect.
    """
    request = urllib2.Request(url)
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request.add_header("User-Agent", user_agent)
    pagefile = urllib2.urlopen(request)
    realurl = pagefile.geturl()
    return (pagefile, realurl)
if __name__ == "__main__":
    # Example usage: fetch a calendar file and hand it to a parser.
    theurl = 'http://pzwart2.wdka.hro.nl/ical/schedule.ics'
    # theurl is rebound to the URL reported by the response (see wget).
    (f, theurl) = wget(theurl)
    # print thepage

    # NOTE(review): `ics` is not imported in this snippet -- presumably a
    # calendar-parsing module; confirm which one and import it.
    # The two calls below are alternatives: pick one. If option 1 reads f,
    # f.read() in option 2 will presumably have nothing left (TODO confirm).

    # 1.
    cal = ics.parseData(f) # accepts file objects?

    # or 2.
    thepage = f.read()
    cal = ics.parseData(thepage) # or just "raw" data (string)
</source>
<source lang="python">
import urllib2, urlparse
import html5lib, os
def absolutize(href, base):
    """
    Return href as an absolute URL.

    An href that already starts with "http://" (compared case-insensitively)
    is returned untouched; anything else is resolved against base with
    urlparse.urljoin.
    """
    already_absolute = href.lower().startswith("http://")
    if already_absolute:
        return href
    return urlparse.urljoin(base, href)
def openURL(url, data):
    """
    Open url with a browser-like User-Agent header, sending data along.

    data is forwarded unchanged as the second argument of urllib2.urlopen.
    Returns (page, actualurl): the open file-like response and the value of
    its geturl(), which may be different from url in the case of a redirect.
    """
    browser_id = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    req = urllib2.Request(url)
    req.add_header("User-Agent", browser_id)
    response = urllib2.urlopen(req, data)
    return (response, response.geturl())
def downloadURL(url, foldername):
    """
    Download url and save the response body as a file inside foldername.

    The request carries a browser-like User-Agent header. The local file
    name is the last component of the URL's path (the query string is not
    part of it); a file of that name already in foldername is overwritten.
    foldername must already exist. Returns None.
    """
    request = urllib2.Request(url)
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request.add_header("User-Agent", user_agent)
    pagefile = urllib2.urlopen(request)
    try:
        data = pagefile.read()
    finally:
        # Release the response even when reading fails.
        pagefile.close()

    # Index 2 of the urlparse result is the path component of the URL.
    urlpath = urlparse.urlparse(url)[2]
    filename = os.path.join(foldername, os.path.basename(urlpath))
    out = open(filename, "wb")
    try:
        out.write(data)
    finally:
        # Close explicitly so the data is flushed to disk before returning.
        out.close()
# Page to request; each line of postdata.txt is sent to it as request data.
url = "http://www.ah.nl/previouslybought/PreviouslyBought.do"
# Single-request variant, kept for reference:
# f, url2 = openURL(url, "cardNumber=<card number>")
# NOTE(review): f and url2 are overwritten on every iteration, so only the
# response for the last line of postdata.txt reaches the parser below, and
# both names are undefined if the file is empty -- confirm this is intended.
for line in open("postdata.txt"):
    data = line.strip()
    # NOTE(review): the "fooo" prefix and "&alkfjlaskdjf" suffix look like
    # placeholders for the real form fields -- verify before use.
    f, url2 = openURL(url, "fooo"+data+"&alkfjlaskdjf")
# Parse the response into a DOM tree using html5lib's "dom" tree builder.
parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
tree = parser.parse(f)
f.close()
tree.normalize()
# Save every image the page references into the "output" folder.
for node in tree.getElementsByTagName("img"):
    src = node.getAttribute("src")
    if src:
        # Resolve relative paths against url2, the URL reported by the response.
        src = absolutize(src, url2)
        print src
        downloadURL(src, "output")
</source>
</source>

Latest revision as of 16:42, 17 November 2009

Program to download files from the Web. Includes powerful "recursive" features that allow easily downloading entire portions of sites including linked images / other resources.

Examples

"Mirroring" a site

wget -Erk -nH 'http://www.yoursite.com/yourdirectory/yourfile.php'

Source: http://www.dreamincode.net/forums/showtopic8317.htm

What are the options used here (from wget manpage):

-E
--html-extension

If a file of type application/xhtml+xml or text/html is downloaded and the URL does not end with the regexp \.[Hh][Tt][Mm][Ll]?, this option will cause the suffix .html to be appended to the local filename. This is useful, for instance, when you’re mirroring a remote site that uses .asp pages, but you want the mirrored pages to be viewable on your stock Apache server. Another good use for this is when you’re downloading CGI-generated materials. A URL like http://site.com/article.cgi?25 will be saved as article.cgi?25.html.

-r

Recursive: Meaning it downloads links as well.

-k --convert-links

After the download is complete, convert the links in the document to make them suitable for local viewing. This affects not only the visible hyperlinks, but any part of the document that links to external content, such as embedded images, links to style sheets, hyperlinks to non-HTML content, etc.

-nH Turns off creating a top-level folder.

Grabbing images

This simple two-line script uses wget to collect all the JPEGs from a website, then uses ImageMagick's montage tool to combine them into a single image.

wget -e robots=off -r -nd -np --follow-tags=img -A.jpg,.jpeg http://www.colourlovers.com
montage *.jpg ../public_html/montage.jpg

Doing the same thing in Python

import urllib2

def wget(url):
    """
    Open url while identifying as a desktop Firefox browser.

    Returns (page, actualurl): page is the open file-like response from
    urllib2.urlopen (not yet read) and actualurl is the value of its
    geturl(), which may be different from url in the case of a redirect.
    """
    browser_id = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    req = urllib2.Request(url)
    req.add_header("User-Agent", browser_id)
    response = urllib2.urlopen(req)
    return (response, response.geturl())

if __name__ == "__main__":
	# Example usage: fetch a calendar file and hand it to a parser.
	theurl = 'http://pzwart2.wdka.hro.nl/ical/schedule.ics'
	# theurl is rebound to the URL reported by the response (see wget).
	(f, theurl) = wget(theurl)
	# print thepage

	# NOTE(review): `ics` is not imported in this snippet -- presumably a
	# calendar-parsing module; confirm which one and import it.
	# The two calls below are alternatives: pick one. If option 1 reads f,
	# f.read() in option 2 will presumably have nothing left (TODO confirm).

	# 1.
	cal = ics.parseData(f) # accepts file objects?

	# or 2.
	thepage = f.read()
	cal = ics.parseData(thepage) # or just "raw" data (string)
import urllib2, urlparse
import html5lib, os
 
def absolutize(href, base):
    """
    Return href as an absolute URL.

    An href that already starts with "http://" (compared case-insensitively)
    is returned untouched; anything else is resolved against base with
    urlparse.urljoin.
    """
    already_absolute = href.lower().startswith("http://")
    if already_absolute:
        return href
    return urlparse.urljoin(base, href)

def openURL(url, data):
    """
    Open url with a browser-like User-Agent header, sending data along.

    data is forwarded unchanged as the second argument of urllib2.urlopen.
    Returns (page, actualurl): the open file-like response and the value of
    its geturl(), which may be different from url in the case of a redirect.
    """
    browser_id = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    req = urllib2.Request(url)
    req.add_header("User-Agent", browser_id)
    response = urllib2.urlopen(req, data)
    return (response, response.geturl())

def downloadURL(url, foldername):
    """
    Download url and save the response body as a file inside foldername.

    The request carries a browser-like User-Agent header. The local file
    name is the last component of the URL's path (the query string is not
    part of it); a file of that name already in foldername is overwritten.
    foldername must already exist. Returns None.
    """
    request = urllib2.Request(url)
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14"
    request.add_header("User-Agent", user_agent)
    pagefile = urllib2.urlopen(request)
    try:
        data = pagefile.read()
    finally:
        # Release the response even when reading fails.
        pagefile.close()

    # Index 2 of the urlparse result is the path component of the URL.
    urlpath = urlparse.urlparse(url)[2]
    filename = os.path.join(foldername, os.path.basename(urlpath))
    out = open(filename, "wb")
    try:
        out.write(data)
    finally:
        # Close explicitly so the data is flushed to disk before returning.
        out.close()

# Page to request; each line of postdata.txt is sent to it as request data.
url = "http://www.ah.nl/previouslybought/PreviouslyBought.do"
# Single-request variant, kept for reference:
# f, url2 = openURL(url, "cardNumber=<card number>")

# NOTE(review): f and url2 are overwritten on every iteration, so only the
# response for the last line of postdata.txt reaches the parser below, and
# both names are undefined if the file is empty -- confirm this is intended.
for line in open("postdata.txt"):
    data = line.strip()
    # NOTE(review): the "fooo" prefix and "&alkfjlaskdjf" suffix look like
    # placeholders for the real form fields -- verify before use.
    f, url2 = openURL(url, "fooo"+data+"&alkfjlaskdjf")

# Parse the response into a DOM tree using html5lib's "dom" tree builder.
parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
tree = parser.parse(f)
f.close()
tree.normalize()
# Save every image the page references into the "output" folder.
for node in tree.getElementsByTagName("img"):
    src = node.getAttribute("src")
    if src:
        # Resolve relative paths against url2, the URL reported by the response.
        src = absolutize(src, url2)
        print src
        downloadURL(src, "output")