|
|
(17 intermediate revisions by 5 users not shown) |
Line 1: |
Line 1: |
| 11-18 | Nicolas Maleve - Thematic Project | | 11-18 | Nicolas Maleve - Thematic Project |
|
| |
|
| = Cookbook Recipes for Goodiff Workshop = | | |
| | === Cookbook Recipes for Goodiff Workshop === |
|
| |
|
| * [[Simplifying_HTML_by_removing_"invisible"_parts]] | | * [[Simplifying_HTML_by_removing_"invisible"_parts]] |
| | * [[Stripping all the tags from HTML to get pure text]] |
| | * [[Looking up synonym-sets for a word]] |
| | * [[Splitting text into sentences]] |
| | * [[Removing common words / stopwords]] |
| | * [[Finding capitalized words]] |
| | * [[Extracting parts of an HTML document]] |
| | * [[Extracting the text contents of a node]] |
| | * [[Turning part of a page back into code (aka serialization)]] |
|
| |
|
| === Stripping all the tags from HTML to get pure text (nltk) ===
| |
|
| |
| You can use nltk.util.clean_html to remove all tags
| |
|
| |
| <source lang="python">
| |
# NOTE(review): nltk.util.clean_html was removed in NLTK 3.x (it now raises
# NotImplementedError and tells you to use an HTML parser instead).  This is
# a standard-library replacement that strips all tags, skips the contents of
# <script> and <style>, and normalizes whitespace, reproducing the sample
# output shown below ('Hello This is some crazy text . OK!').
from html.parser import HTMLParser


class _TextExtractor(HTMLParser):
    """Collect text content of an HTML document, skipping <script>/<style> bodies."""

    _SKIP = {"script", "style"}

    def __init__(self):
        super().__init__()
        self._chunks = []   # pieces of character data, in document order
        self._skip = 0      # >0 while inside a <script>/<style> element

    def handle_starttag(self, tag, attrs):
        if tag in self._SKIP:
            self._skip += 1

    def handle_endtag(self, tag):
        if tag in self._SKIP and self._skip:
            self._skip -= 1

    def handle_data(self, data):
        if not self._skip:
            self._chunks.append(data)

    def get_text(self):
        # Separate chunks with spaces, then collapse all whitespace runs.
        return " ".join(" ".join(self._chunks).split())


def clean_html(source):
    """Strip all tags from *source* (an HTML string) and return the plain text."""
    extractor = _TextExtractor()
    extractor.feed(source)
    extractor.close()
    return extractor.get_text()
| |
| </source>
| |
|
| |
| example:
| |
| <source lang="python">
| |
| nltk.util.clean_html("<html><head><title>Hello</title><script>var foo=3;</script></head><body><p>This is <u>some crazy text</u>. OK!</body></html>")
| |
| </source>
| |
|
| |
| result:
| |
| 'Hello This is some crazy text . OK!'
| |
|
| |
| === Looking up synonym-sets for a word (wordnet) ===
| |
|
| |
| <source lang="python">
| |
# Look up the WordNet synonym sets ("synsets") for a word with NLTK.
# NOTE(review): in NLTK 3.x, Synset.name / .definition / .examples became
# methods; the bare-attribute style of the original only works on NLTK 2.x.
# Print calls updated to Python 3 syntax.
from nltk.corpus import wordnet

meanings = wordnet.synsets('woman')

for m in meanings:
    print("===", m.name(), "===")
    print(m.definition())
    print("\t* ".join(m.examples()))
| |
| </source>
| |
|
| |
| === Splitting text into sentences (nltk) ===
| |
|
| |
| <source lang="python">
| |
# Split text into sentences using NLTK's pre-trained Punkt sentence tokenizer.
# (Print call updated from Python 2 statement syntax to Python 3.)
from nltk.tokenize import sent_tokenize

# Abbreviations such as "J.D." can still fool the tokenizer -- see the
# sample output shown after the snippet.
print(sent_tokenize("I read J.D. Salinger in High School. He wrote 'Catcher in the Rye'."))
| |
| </source>
| |
|
| |
| ['I read J.D.', 'Salinger in High School.', "He wrote 'Catcher in the Rye'."]
| |
|
| |
| So you can see it's not perfect.
| |
|
| |
| === Removing common words / stopwords (nltk) ===
| |
|
| |
| <source lang="python">
| |
# Remove common words ("stopwords") from a token list using NLTK's corpus.
from nltk.corpus import stopwords

# Build the set once so each membership test below is O(1).
english_stops = set(stopwords.words("english"))

words = "Stopwords are common words that are often handy to remove or ignore when processing text".split()

# NOTE(review): NLTK's stopword list is all lowercase, so compare
# case-insensitively -- the original's exact comparison would keep
# sentence-initial stopwords such as "The".  (Print updated to Python 3.)
words = [w for w in words if w.lower() not in english_stops]

print(words)
| |
| </source>
| |
|
| |
| === Finding capitalized words (regex) ===
| |
|
| |
| <source lang="python">
| |
import re

# NOTE(review): the original pattern r"\b[A-Z]+\b" only matches words written
# ENTIRELY in capitals (e.g. "NASA"), not capitalized words like "Alice".
# Match one capital letter followed by any further letters instead.
pat = re.compile(r"\b[A-Z][a-zA-Z]*\b")

# Sample input (the original referenced an undefined `text` variable);
# substitute your own text here.
text = "Alice met Bob near the NASA office in Houston."

print(pat.findall(text))
| |
| </source>
| |
|
| |
| === Extracting parts of an HTML document ===
| |
|
| |
| The html5lib parser is code that turns the source text of an HTML page
| |
| into a structured object, allowing you, for instance, to use CSS selectors
| |
| or XPath expressions to select/extract portions of a page
| |
|
| |
| You can use xpath expressions:
| |
|
| |
| <source lang="python">
| |
# Parse HTML with html5lib into an lxml tree, then select nodes with XPath.
# (Print updated to Python 3; "".join() accepts the itertext() iterator
# directly, so the original's intermediate list comprehension is unnecessary.)
import html5lib
import lxml

htmlsource = "<html><body><p>Example page.</p><p>More stuff.</p></body></html>"

# namespaceHTMLElements=False keeps element names plain ("p" rather than a
# namespaced "{...}p"), so the XPath below works without namespace prefixes.
htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
page = htmlparser.parse(htmlsource)

# XPath indices are 1-based: p[2] selects the *second* paragraph.
p = page.xpath("/html/body/p[2]")
if p:
    p = p[0]
    print("".join(p.itertext()))
| |
| </source>
| |
|
| |
| outputs:
| |
| More stuff.
| |
|
| |
| Also CSS selectors are possible:
| |
|
| |
| <source lang="python">
| |
# Same parse as the XPath example, but select nodes with CSS selectors via
# lxml.cssselect.  (Prints updated to Python 3 syntax.)
import html5lib
import lxml
import lxml.cssselect

htmlsource = "<html><body><p>Example page.</p><p>More stuff.</p></body></html>"

htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
page = htmlparser.parse(htmlsource)

# Compile the selector once, then apply it to the document root; it returns
# matching elements in document order.
selector = lxml.cssselect.CSSSelector("p")
for p in selector(page):
    print("-" * 20)
    print("".join(p.itertext()))
| |
|
| |
| </source>
| |
|
| |
| --------------------
| |
| Example page.
| |
| --------------------
| |
| More stuff.
| |
|
| |
| === Working with lxml ===
| |
|
| |
| ==== Extracting the text contents of a node (lxml) ====
| |
|
| |
| The itertext() method of a node iterates over every piece of text it contains, in document order, which makes it handy for extracting plain text.
| |
|
| |
| <source lang="python">
| |
# itertext() walks the node's subtree, yielding each piece of text in
# document order.  (Print updated from Python 2 statement syntax.)
for t in node.itertext():
    print(t)
| |
| </source>
| |
|
| |
| <source lang="python">
| |
| text = "".join(list(node.itertext()))
| |
| </source>
| |
|
| |
|
| ==== Turning part of a page back into code (aka serialization) (lxml) ====
| | ; TOS selected words frequency in time (by Dusan and Natasa) |
| | [[Goodiff_TOS_word_frequency | source code]] |
| | * [https://spreadsheets.google.com/pub?key=0AgT6KLPteXsOdF84Y0F3RWpxQnQ2ODFOLVA3RG9XWFE&output=html Facebook TOS] |
| | * [https://spreadsheets.google.com/pub?key=0AgT6KLPteXsOdHRuczQxUEU4dWxjWmNjaUtKb2JfM1E&single=true&gid=0&output=html Skype TOS] |
|
| |
|
| Imagine you want to print out the full code of part of a page.
| | ; Simple statistics TOS |
| Use lxml.etree.tostring. This converts any node back into source code -- a process called serialization.
| | * [[16-03-2011 Laura Amy Laurier | process]] |
|
| |
|
| <source lang="python">
| | ; TOS Game |
| htmlsource="<html><body><p>Example page.</p><p>More stuff with <i>markup</i>.</p></body></html>"
| | * [[16-03-2011_Danny_Fabien_Mirjam | Lost in TOS]] |
# Serialize part of a page back into source code with lxml.etree.tostring.
# NOTE(review): this snippet re-uses `htmlsource` defined just above and the
# html5lib / lxml.cssselect / lxml.etree imports from the earlier examples.
htmlparser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
page = htmlparser.parse(htmlsource)

selector = lxml.cssselect.CSSSelector("p")
# selector(page) returns matches in document order; index 1 is the second <p>.
p = selector(page)[1]
# tostring() converts the node (and its subtree) back into markup; on
# Python 3 it returns bytes unless encoding="unicode" is passed.
print(lxml.etree.tostring(p))
| |
| </source>
| |