NLTK text analysis
Revision as of 20:16, 21 October 2020 by Naaami (talk | contribs) (→Natural Language Tool Kit_141020_Michael)
Natural Language Tool Kit_141020_Michael
{
"cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "url = \"https://git.xpub.nl/XPUB/S13-Words-for-the-Future-notebooks/raw/branch/master/txt/words-for-the-future/UNDECIDABILITY.txt\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'https://git.xpub.nl/XPUB/S13-Words-for-the-Future-notebooks/raw/branch/master/txt/words-for-the-future/UNDECIDABILITY.txt'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "url" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from urllib.request import urlopen" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<http.client.HTTPResponse at 0x7f2f424be6a0>" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "r = urlopen(url)\n", "r" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "rawtext = r.read()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "text = rawtext.decode()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "str" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(text)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "text = urlopen(url).read().decode()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "15990" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(text)" ] }, { "cell_type": "code", 
"execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\r'" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text[-1]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'U'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text[0]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "words = text.split()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2548" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(words)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Undecidability'" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "words[0]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Multiplying'" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "words[3]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'158-172.'" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "words[-1]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "from nltk import word_tokenize, Text" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "tokens = word_tokenize(text)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3047" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(tokens)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { 
"data": { "text/plain": [ "'.'" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokens[-1]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Undecidability',\n", " 'Silvia',\n", " 'Bottiroli',\n", " 'Multiplying',\n", " 'the',\n", " 'Visible',\n", " 'The',\n", " 'word',\n", " '[',\n", " 'i']" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokens[:10]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Memos', 'for', 'the', 'Next', 'Millennium', '[', 'i', ']', 'written']" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokens[21:30] # not including 30th word" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "strengers = Text(tokens)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<Text: Silvia Bottiroli Multiplying the Visible The word [ i>" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "strengers" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Displaying 11 of 11 matches:\n", " ] attempts to escape the vortex of multiplicity are useless. ” [ 6 ] In his fifth m\n", " , he subsequently focuses on [ i ] multiplicity [ i ] as a way for literature to co\n", "fore , let ’ s think visibility and multiplicity together , as : a multiplication of\n", "n the contrary , it is generating a multiplicity of different gazes that are all leg\n", "ed and thus incomplete and open . A Multiplicity of Gazes An undecidable artwork is \n", "ics today , is that they generate a multiplicity of gazes and of forms of spectators\n", " positions and points of view . 
The multiplicity of gazes produced and gathered by u\n", "tes a radical collectivity based on multiplicity and on conflicting positions that a\n", "ility and from its encounter with a multiplicity of gazes . Preserving it is possibl\n", "encounter between undecidable art , multiplicity of gazes , and a curatorial dimensi\n", " ibid , p. 98 . 7 . Italo Calvino , Multiplicity , [ i ] Six Memos for the Next Mill\n" ] } ], "source": [ "strengers.concordance(\"multiplicity\", width = 84, lines = 72)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "idability Silvia Bottiroli Multiplying the Visible The word [ i ] undecidable [ i\n", "lvia Bottiroli Multiplying the Visible The word [ i ] undecidable [ i ] appears i\n", "e [ i ] appears in [ i ] Six Memos for the Next Millennium [ i ] written by Italo\n", "ry lectures at Harvard University . In the last months of his life Calvino worked\n", "rishly on these lectures , but died in the process . In the five memos he left be\n", "ectures , but died in the process . In the five memos he left behind , he did not\n", "i ] Visibility [ i ] , revolves around the capacity of literature to generate ima\n", "flow continuously . Calvino focuses on the imagination as “ the repertory of what\n", "alvino focuses on the imagination as “ the repertory of what is potential ; what \n", " exist but might have existed. ” [ 2 ] The main concern that he brings forth lies\n", "ncern that he brings forth lies within the relation between contemporary culture \n", "contemporary culture and imagination : the risk to definitely lose , in the overp\n", "ion : the risk to definitely lose , in the overproduction of images , the power o\n", "se , in the overproduction of images , the power of bringing visions into focus w\n", "g [ i ] in terms of images. ” [ 3 ] In the last pages of the lecture , he propose\n", "f images. 
” [ 3 ] In the last pages of the lecture , he proposes a shift from und\n", "he proposes a shift from understanding the fantastic world of the artist , not as\n", "m understanding the fantastic world of the artist , not as indefinable , but as [\n", "th this word , Calvino means to define the coexistence and the relation , within \n", "no means to define the coexistence and the relation , within any literary work , \n", ", between three different dimensions . The first dimension is the artist ’ s imag\n", "nt dimensions . The first dimension is the artist ’ s imagination – a world of po\n", "at no work will succeed in realizing . The second is the reality as we experience\n", "l succeed in realizing . The second is the reality as we experience it by living \n", "we experience it by living . Finally , the third is the world of the actual work \n", " it by living . Finally , the third is the world of the actual work , made by the\n", " . Finally , the third is the world of the actual work , made by the layers of si\n", "the world of the actual work , made by the layers of signs that accumulate in it \n", "ns that accumulate in it ; compared to the first two worlds , it is “ also infini\n", "ctory to formulation. ” [ 4 ] He calls the link between these three worlds “ the \n", " the link between these three worlds “ the undecidable , the paradox of an infini\n", "these three worlds “ the undecidable , the paradox of an infinite whole that cont\n", "ino , artistic operations involve , by the means of the infinity of linguistic po\n", "c operations involve , by the means of the infinity of linguistic possibilities ,\n", "infinity of linguistic possibilities , the infinity of the artist ’ s imagination\n", "uistic possibilities , the infinity of the artist ’ s imagination , and the infin\n", "ty of the artist ’ s imagination , and the infinity of contingencies . Therefore \n", "ity of contingencies . 
Therefore , “ [ the ] attempts to escape the vortex of mul\n", "erefore , “ [ the ] attempts to escape the vortex of multiplicity are useless. ” \n", " as a way for literature to comprehend the complex nature of the world that for t\n", "re to comprehend the complex nature of the world that for the author is a whole o\n", "e complex nature of the world that for the author is a whole of wholes , where th\n", "he author is a whole of wholes , where the acts of watching and knowing also inte\n", "watching and knowing also intervene in the observed reality and alter it . Calvin\n", "are readable as different narratives . The lecture revolves around some novels th\n", "ain multiple worlds and make space for the readers ’ imaginations . The common so\n", "space for the readers ’ imaginations . The common source to all these experiments\n", "all these experiments seems to rely in the understanding of the contemporary nove\n", " seems to rely in the understanding of the contemporary novel “ as an encyclopedi\n", " , as a network of connections between the events , the people , and the things o\n", "rk of connections between the events , the people , and the things of the world. \n", " between the events , the people , and the things of the world. ” [ 7 ] Therefore\n", "vents , the people , and the things of the world. ” [ 7 ] Therefore , let ’ s thi\n", "ic production and define a context for the undecidable , or rather for undecidabi\n", "le , or rather for undecidability , as the quality of being undecidable . Calvino\n", "tion modes and doesn ’ t fade out from the scene of the ‘ real ’ world . We might\n", "d doesn ’ t fade out from the scene of the ‘ real ’ world . 
We might stretch this\n", " s potentiality is that of multiplying the visible as an actual counterstrategy t\n", "isible as an actual counterstrategy to the proliferation of images that surrounds\n", "ly articulates , redefines , or alters the complex system of links , bounds , and\n", "specific to some artworks within which the three worlds that Calvino describes me\n", "tains and under certain terms performs the possibility of its actualisation , a w\n", "into one actual form . In particular , the potentiality generated by undecidable \n", "c of ‘ and… and… and… ’ as opposite to the logic of ‘ either… or… ’ that seems to\n", "ature and just exist as such . None of the images of an artwork are being more or\n", "twork are being more or less real than the others , no matter whether they come a\n", "vidual or collective fantasies . It is the art ( work ) as such that creates a gr\n", "s such that creates a ground where all the images that come into visibility share\n", "images that come into visibility share the same gradient of reality , no matter w\n", "itors or spectators to enter into – if the invitation of art is often that of los\n", "itation of art is often that of losing the contact with known worlds in order to \n", "Here , spectators are invited to enter the work ’ s fictional world carrying with\n", "ctional world carrying with themselves the so-called real world and all their oth\n", "ll these worlds are equally welcomed . 
The artwork may then be navigated either b\n" ] } ], "source": [ "for line in strengers.concordance_list(\"the\", width=82, lines=74):\n", " print (line.left_print, line.query, line.right_print)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Undecidability\n", "University\n", "visibility\n", "Visibility\n", "capacity\n", "reality\n", "infinity\n", "infinity\n", "infinity\n", "multiplicity\n", "multiplicity\n", "reality\n", "visibility\n", "multiplicity\n", "undecidability\n", "quality\n", "potentiality\n", "visibility\n", "undecidability\n", "undecidability\n", "quality\n", "possibility\n", "potentiality\n", "potentiality\n", "reality\n", "reality\n", "visibility\n", "reality\n", "undecidability\n", "reality\n", "contemporaneity\n", "possibility\n", "possibility\n", "possibility\n", "undecidability\n", "community\n", "possibility\n", "multiplicity\n", "Multiplicity\n", "multiplicity\n", "multiplicity\n", "community\n", "collectivity\n", "multiplicity\n", "reality\n", "responsibility\n", "undecidability\n", "potentiality\n", "undecidability\n", "collectivity\n", "visibility\n", "Undecidability\n", "possibility\n", "potentiality\n", "quality\n", "undecidability\n", "multiplicity\n", "intensity\n", "multiplicity\n", "Visibility\n", "University\n", "Multiplicity\n", "University\n" ] } ], "source": [ "for w in strengers:\n", " if w.endswith(\"ity\"):\n", " print (w) # but then this will show overlapping, looping.." 
] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "# and now collected in a list, and squashing case, and using a \"set\" to remove duplicates.\n", "\n", "ity = []\n", "for w in strengers :\n", " if w.endswith(\"ity\"):\n", " #print(w)\n", " ity.append(w.lower())\n", " #strengers.concordance()\n", "ity = set(ity) \n", "\n" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "with open(\"nami_undecidibility_Michael_NLTK_141020.text\", \"w\") as output:\n", "\n", " s = 0\n", "\n", " for word in ity:\n", " #strengers.concordance(word, width = 84)\n", " for line in strengers.concordance_list(word, width=82, lines=74):\n", " t = line.left_print + \" \" * (2 + int(s)) + line.query + \" \" * (2 + int(s)) + line.right_print \n", " #print(s)\n", " print (t[:82], file = output)#0-82 limited\n", " s = s + 0.3\n", " \n", " \n", " \n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for w in strengers:\n", " if w.endswith(\"le\"):\n", " print (w) # but then this will show overlapping, looping.." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# and now collected in a list, and squashing case, and using a set to remove duplicates\n", "\n", "le = []\n", "for w in strengers :\n", " if w.endswith(\"le\"):\n", " #print(w)\n", " le.append(w.lower())\n", " #strengers.concordance()\n", "le = set(le) \n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for word in le:\n", " strengers.concordance(word, width = 84)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Displaying 11 of 11 matches:\n", "cape the vortex of multiplicity are useless. ” [ 6\n", "y focuses on [ i ] multiplicity [ i ] as a way for\n", "ink visibility and multiplicity together , as : a \n", "it is generating a multiplicity of different gazes\n", "plete and open . A Multiplicity of Gazes An undeci\n", "at they generate a multiplicity of gazes and of fo\n", "ints of view . The multiplicity of gazes produced \n", "lectivity based on multiplicity and on conflicting\n", "s encounter with a multiplicity of gazes . Preserv\n", " undecidable art , multiplicity of gazes , and a c\n", " . 
Italo Calvino , Multiplicity , [ i ] Six Memos \n" ] } ], "source": [ "strengers.concordance(\"multiplicity\", width = 50)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "kind world logic space practice undecidable visibility capacity images\n", "and repertory overproduction power thinking understanding means\n", "coexistence layers paradox whole\n" ] } ], "source": [ "strengers.similar(\"multiplicity\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "strengers.common_contexts([\"undecidability\", \"multiplicity\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "strengers.dispersion_plot([\"the\", \"multiplicity\", \"performance\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from IPython.core.pylabtools import figsize" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "figsize(20.0, 20.0) # make the graph longer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk.probability import FreqDist" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "freq = FreqDist(tokens) # frequency distribution" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "freq.keys()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "freq[\"the\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "freq.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "freq.plot(50, cumulative = True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "freq.plot(30)" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4
}