NLTK text analysis

From XPUB & Lens-Based wiki

Natural Language Tool Kit_141020_Michael

{

"cells": [
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": []
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": []
 },
 {
  "cell_type": "code",
  "execution_count": 1,
  "metadata": {},
  "outputs": [],
  "source": [
   "url = \"https://git.xpub.nl/XPUB/S13-Words-for-the-Future-notebooks/raw/branch/master/txt/words-for-the-future/UNDECIDABILITY.txt\""
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 2,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "'https://git.xpub.nl/XPUB/S13-Words-for-the-Future-notebooks/raw/branch/master/txt/words-for-the-future/UNDECIDABILITY.txt'"
     ]
    },
    "execution_count": 2,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "url"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 3,
  "metadata": {},
  "outputs": [],
  "source": [
   "from urllib.request import urlopen"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 4,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "<http.client.HTTPResponse at 0x7f2f424be6a0>"
     ]
    },
    "execution_count": 4,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "r = urlopen(url)\n",
   "r"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 5,
  "metadata": {},
  "outputs": [],
  "source": [
   "rawtext = r.read()"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 6,
  "metadata": {},
  "outputs": [],
  "source": [
   "text = rawtext.decode()"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 7,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "str"
     ]
    },
    "execution_count": 7,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "type(text)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 8,
  "metadata": {},
  "outputs": [],
  "source": [
   "text = urlopen(url).read().decode()"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 9,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "15990"
     ]
    },
    "execution_count": 9,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "len(text)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 10,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "'\\r'"
     ]
    },
    "execution_count": 10,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "text[-1]"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 11,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "'U'"
     ]
    },
    "execution_count": 11,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "text[0]"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 33,
  "metadata": {},
  "outputs": [],
  "source": [
   "words = text.split()"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 13,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "2548"
     ]
    },
    "execution_count": 13,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "len(words)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 14,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "'Undecidability'"
     ]
    },
    "execution_count": 14,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "words[0]"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 15,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "'Multiplying'"
     ]
    },
    "execution_count": 15,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "words[3]"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 16,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "'158-172.'"
     ]
    },
    "execution_count": 16,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "words[-1]"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 17,
  "metadata": {},
  "outputs": [],
  "source": [
   "from nltk import word_tokenize, Text"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 18,
  "metadata": {},
  "outputs": [],
  "source": [
   "tokens = word_tokenize(text)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 19,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "3047"
     ]
    },
    "execution_count": 19,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "len(tokens)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 20,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "'.'"
     ]
    },
    "execution_count": 20,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "tokens[-1]"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 21,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "['Undecidability',\n",
      " 'Silvia',\n",
      " 'Bottiroli',\n",
      " 'Multiplying',\n",
      " 'the',\n",
      " 'Visible',\n",
      " 'The',\n",
      " 'word',\n",
      " '[',\n",
      " 'i']"
     ]
    },
    "execution_count": 21,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "tokens[:10]"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 22,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "['Memos', 'for', 'the', 'Next', 'Millennium', '[', 'i', ']', 'written']"
     ]
    },
    "execution_count": 22,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "tokens[21:30] # not including 30th word"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 23,
  "metadata": {},
  "outputs": [],
  "source": [
   "strengers = Text(tokens)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 38,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
      "<Text: Silvia Bottiroli Multiplying the Visible The word [ i>"
     ]
    },
    "execution_count": 38,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "strengers"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 40,
  "metadata": {},
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "Displaying 11 of 11 matches:\n",
     " ] attempts to escape the vortex of multiplicity are useless. ” [ 6 ] In his fifth m\n",
     " , he subsequently focuses on [ i ] multiplicity [ i ] as a way for literature to co\n",
     "fore , let ’ s think visibility and multiplicity together , as : a multiplication of\n",
     "n the contrary , it is generating a multiplicity of different gazes that are all leg\n",
     "ed and thus incomplete and open . A Multiplicity of Gazes An undecidable artwork is \n",
     "ics today , is that they generate a multiplicity of gazes and of forms of spectators\n",
     " positions and points of view . The multiplicity of gazes produced and gathered by u\n",
     "tes a radical collectivity based on multiplicity and on conflicting positions that a\n",
     "ility and from its encounter with a multiplicity of gazes . Preserving it is possibl\n",
     "encounter between undecidable art , multiplicity of gazes , and a curatorial dimensi\n",
     " ibid , p. 98 . 7 . Italo Calvino , Multiplicity , [ i ] Six Memos for the Next Mill\n"
    ]
   }
  ],
  "source": [
   "strengers.concordance(\"multiplicity\", width = 84, lines = 72)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 26,
  "metadata": {},
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "idability Silvia Bottiroli Multiplying the Visible The word [ i ] undecidable [ i\n",
     "lvia Bottiroli Multiplying the Visible The word [ i ] undecidable [ i ] appears i\n",
     "e [ i ] appears in [ i ] Six Memos for the Next Millennium [ i ] written by Italo\n",
     "ry lectures at Harvard University . In the last months of his life Calvino worked\n",
     "rishly on these lectures , but died in the process . In the five memos he left be\n",
     "ectures , but died in the process . In the five memos he left behind , he did not\n",
     "i ] Visibility [ i ] , revolves around the capacity of literature to generate ima\n",
     "flow continuously . Calvino focuses on the imagination as “ the repertory of what\n",
     "alvino focuses on the imagination as “ the repertory of what is potential ; what \n",
     " exist but might have existed. ” [ 2 ] The main concern that he brings forth lies\n",
     "ncern that he brings forth lies within the relation between contemporary culture \n",
     "contemporary culture and imagination : the risk to definitely lose , in the overp\n",
     "ion : the risk to definitely lose , in the overproduction of images , the power o\n",
     "se , in the overproduction of images , the power of bringing visions into focus w\n",
     "g [ i ] in terms of images. ” [ 3 ] In the last pages of the lecture , he propose\n",
     "f images. ” [ 3 ] In the last pages of the lecture , he proposes a shift from und\n",
     "he proposes a shift from understanding the fantastic world of the artist , not as\n",
     "m understanding the fantastic world of the artist , not as indefinable , but as [\n",
     "th this word , Calvino means to define the coexistence and the relation , within \n",
     "no means to define the coexistence and the relation , within any literary work , \n",
     ", between three different dimensions . The first dimension is the artist ’ s imag\n",
     "nt dimensions . The first dimension is the artist ’ s imagination – a world of po\n",
     "at no work will succeed in realizing . The second is the reality as we experience\n",
     "l succeed in realizing . The second is the reality as we experience it by living \n",
     "we experience it by living . Finally , the third is the world of the actual work \n",
     " it by living . Finally , the third is the world of the actual work , made by the\n",
     " . Finally , the third is the world of the actual work , made by the layers of si\n",
     "the world of the actual work , made by the layers of signs that accumulate in it \n",
     "ns that accumulate in it ; compared to the first two worlds , it is “ also infini\n",
     "ctory to formulation. ” [ 4 ] He calls the link between these three worlds “ the \n",
     " the link between these three worlds “ the undecidable , the paradox of an infini\n",
     "these three worlds “ the undecidable , the paradox of an infinite whole that cont\n",
     "ino , artistic operations involve , by the means of the infinity of linguistic po\n",
     "c operations involve , by the means of the infinity of linguistic possibilities ,\n",
     "infinity of linguistic possibilities , the infinity of the artist ’ s imagination\n",
     "uistic possibilities , the infinity of the artist ’ s imagination , and the infin\n",
     "ty of the artist ’ s imagination , and the infinity of contingencies . Therefore \n",
     "ity of contingencies . Therefore , “ [ the ] attempts to escape the vortex of mul\n",
     "erefore , “ [ the ] attempts to escape the vortex of multiplicity are useless. ” \n",
     " as a way for literature to comprehend the complex nature of the world that for t\n",
     "re to comprehend the complex nature of the world that for the author is a whole o\n",
     "e complex nature of the world that for the author is a whole of wholes , where th\n",
     "he author is a whole of wholes , where the acts of watching and knowing also inte\n",
     "watching and knowing also intervene in the observed reality and alter it . Calvin\n",
     "are readable as different narratives . The lecture revolves around some novels th\n",
     "ain multiple worlds and make space for the readers ’ imaginations . The common so\n",
     "space for the readers ’ imaginations . The common source to all these experiments\n",
     "all these experiments seems to rely in the understanding of the contemporary nove\n",
     " seems to rely in the understanding of the contemporary novel “ as an encyclopedi\n",
     " , as a network of connections between the events , the people , and the things o\n",
     "rk of connections between the events , the people , and the things of the world. \n",
     " between the events , the people , and the things of the world. ” [ 7 ] Therefore\n",
     "vents , the people , and the things of the world. ” [ 7 ] Therefore , let ’ s thi\n",
     "ic production and define a context for the undecidable , or rather for undecidabi\n",
     "le , or rather for undecidability , as the quality of being undecidable . Calvino\n",
     "tion modes and doesn ’ t fade out from the scene of the ‘ real ’ world . We might\n",
     "d doesn ’ t fade out from the scene of the ‘ real ’ world . We might stretch this\n",
     " s potentiality is that of multiplying the visible as an actual counterstrategy t\n",
     "isible as an actual counterstrategy to the proliferation of images that surrounds\n",
     "ly articulates , redefines , or alters the complex system of links , bounds , and\n",
     "specific to some artworks within which the three worlds that Calvino describes me\n",
     "tains and under certain terms performs the possibility of its actualisation , a w\n",
     "into one actual form . In particular , the potentiality generated by undecidable \n",
     "c of ‘ and… and… and… ’ as opposite to the logic of ‘ either… or… ’ that seems to\n",
     "ature and just exist as such . None of the images of an artwork are being more or\n",
     "twork are being more or less real than the others , no matter whether they come a\n",
     "vidual or collective fantasies . It is the art ( work ) as such that creates a gr\n",
     "s such that creates a ground where all the images that come into visibility share\n",
     "images that come into visibility share the same gradient of reality , no matter w\n",
     "itors or spectators to enter into – if the invitation of art is often that of los\n",
     "itation of art is often that of losing the contact with known worlds in order to \n",
     "Here , spectators are invited to enter the work ’ s fictional world carrying with\n",
     "ctional world carrying with themselves the so-called real world and all their oth\n",
     "ll these worlds are equally welcomed . The artwork may then be navigated either b\n"
    ]
   }
  ],
  "source": [
   "for line in strengers.concordance_list(\"the\", width=82, lines=74):\n",
   "    print (line.left_print, line.query, line.right_print)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 27,
  "metadata": {},
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "Undecidability\n",
     "University\n",
     "visibility\n",
     "Visibility\n",
     "capacity\n",
     "reality\n",
     "infinity\n",
     "infinity\n",
     "infinity\n",
     "multiplicity\n",
     "multiplicity\n",
     "reality\n",
     "visibility\n",
     "multiplicity\n",
     "undecidability\n",
     "quality\n",
     "potentiality\n",
     "visibility\n",
     "undecidability\n",
     "undecidability\n",
     "quality\n",
     "possibility\n",
     "potentiality\n",
     "potentiality\n",
     "reality\n",
     "reality\n",
     "visibility\n",
     "reality\n",
     "undecidability\n",
     "reality\n",
     "contemporaneity\n",
     "possibility\n",
     "possibility\n",
     "possibility\n",
     "undecidability\n",
     "community\n",
     "possibility\n",
     "multiplicity\n",
     "Multiplicity\n",
     "multiplicity\n",
     "multiplicity\n",
     "community\n",
     "collectivity\n",
     "multiplicity\n",
     "reality\n",
     "responsibility\n",
     "undecidability\n",
     "potentiality\n",
     "undecidability\n",
     "collectivity\n",
     "visibility\n",
     "Undecidability\n",
     "possibility\n",
     "potentiality\n",
     "quality\n",
     "undecidability\n",
     "multiplicity\n",
     "intensity\n",
     "multiplicity\n",
     "Visibility\n",
     "University\n",
     "Multiplicity\n",
     "University\n"
    ]
   }
  ],
  "source": [
   "for w in strengers:\n",
   "    if w.endswith(\"ity\"):\n",
   "        print (w) # but then this will show overlapping, looping.."
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 41,
  "metadata": {},
  "outputs": [],
  "source": [
   "# Now collect the words in a list, lower-case them, and use a \"set\" to remove duplicates.\n",
   "\n",
   "ity = []\n",
   "for w in strengers :\n",
   "    if w.endswith(\"ity\"):\n",
   "        #print(w)\n",
   "        ity.append(w.lower())\n",
   "        #strengers.concordance()\n",
   "ity = set(ity)        \n",
   "\n"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 43,
  "metadata": {},
  "outputs": [],
  "source": [
   "# Write the concordance rows of every collected \"-ity\" word to a text file.\n",
   "# The gap around the query word widens a little with each row written.\n",
   "# The text was decoded as UTF-8 and contains curly quotes, so write it back as UTF-8.\n",
   "with open(\"nami_undecidibility_Michael_NLTK_141020.text\", \"w\", encoding=\"utf-8\") as output:\n",
   "\n",
   "    s = 0  # running offset: int(s) extra spaces are added on each side of the query\n",
   "\n",
   "    # sorted() pins the word order; iterating the set directly can give a\n",
   "    # different order (and so a different layout) after every kernel restart.\n",
   "    for word in sorted(ity):\n",
   "        for line in strengers.concordance_list(word, width=82, lines=74):\n",
   "            t = line.left_print + \" \" * (2 + int(s)) + line.query + \" \" * (2 + int(s)) + line.right_print\n",
   "            print(t[:82], file=output)  # clip every row to 82 characters\n",
   "            s = s + 0.3"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": []
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "for w in strengers:\n",
   "    if w.endswith(\"le\"):\n",
   "        print (w) # but then this will show overlapping, looping.."
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "# Now collect the words in a list, lower-case them, and use a set to remove duplicates.\n",
   "\n",
   "le = []\n",
   "for w in strengers :\n",
   "    if w.endswith(\"le\"):\n",
   "        #print(w)\n",
   "        le.append(w.lower())\n",
   "        #strengers.concordance()\n",
   "le = set(le)        \n",
   "\n"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "for word in le:\n",
   "        strengers.concordance(word, width = 84)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": []
 },
 {
  "cell_type": "code",
  "execution_count": 45,
  "metadata": {},
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "Displaying 11 of 11 matches:\n",
     "cape the vortex of multiplicity are useless. ” [ 6\n",
     "y focuses on [ i ] multiplicity [ i ] as a way for\n",
     "ink visibility and multiplicity together , as : a \n",
     "it is generating a multiplicity of different gazes\n",
     "plete and open . A Multiplicity of Gazes An undeci\n",
     "at they generate a multiplicity of gazes and of fo\n",
     "ints of view . The multiplicity of gazes produced \n",
     "lectivity based on multiplicity and on conflicting\n",
     "s encounter with a multiplicity of gazes . Preserv\n",
     " undecidable art , multiplicity of gazes , and a c\n",
     " . Italo Calvino , Multiplicity , [ i ] Six Memos \n"
    ]
   }
  ],
  "source": [
   "strengers.concordance(\"multiplicity\", width = 50)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 44,
  "metadata": {},
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "kind world logic space practice undecidable visibility capacity images\n",
     "and repertory overproduction power thinking understanding means\n",
     "coexistence layers paradox whole\n"
    ]
   }
  ],
  "source": [
   "strengers.similar(\"multiplicity\")"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "strengers.common_contexts([\"undecidability\", \"multiplicity\"])"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "strengers.dispersion_plot([\"the\", \"multiplicity\", \"performance\"])"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "from IPython.core.pylabtools import figsize"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "# Call the figsize helper imported in the previous cell so later plots are drawn larger.\n",
   "figsize(20.0, 20.0)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "from nltk.probability import FreqDist"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "freq = FreqDist(tokens) # frequency distribution"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "freq.keys()"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "freq[\"the\"]"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "freq.plot()"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "freq.plot(50, cumulative = True)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "freq.plot(30)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": []
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": []
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": []
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": []
 }
],
"metadata": {
 "kernelspec": {
  "display_name": "Python 3",
  "language": "python",
  "name": "python3"
 },
 "language_info": {
  "codemirror_mode": {
   "name": "ipython",
   "version": 3
  },
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": "3.7.3"
 }
},
"nbformat": 4,
"nbformat_minor": 4

}