Repository where I mostly put random python scripts.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

284 lines
7.5 KiB

  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "import gensim\n"
  10. ]
  11. },
  12. {
  13. "cell_type": "code",
  14. "execution_count": 4,
  15. "metadata": {},
  16. "outputs": [],
  17. "source": [
  18. "model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True) "
  19. ]
  20. },
  21. {
  22. "cell_type": "code",
  23. "execution_count": 6,
  24. "metadata": {},
  25. "outputs": [
  26. {
  27. "data": {
  28. "text/plain": [
  29. "[('hi', 0.654898464679718),\n",
  30. " ('goodbye', 0.639905571937561),\n",
  31. " ('howdy', 0.6310957074165344),\n",
  32. " ('goodnight', 0.5920578241348267),\n",
  33. " ('greeting', 0.5855878591537476),\n",
  34. " ('Hello', 0.5842196941375732),\n",
  35. " (\"g'day\", 0.5754077434539795),\n",
  36. " ('See_ya', 0.5688871145248413),\n",
  37. " ('ya_doin', 0.5643119812011719),\n",
  38. " ('greet', 0.5636603832244873)]"
  39. ]
  40. },
  41. "execution_count": 6,
  42. "metadata": {},
  43. "output_type": "execute_result"
  44. }
  45. ],
  46. "source": [
  47. "model.most_similar(\"hello\")"
  48. ]
  49. },
  50. {
  51. "cell_type": "code",
  52. "execution_count": 8,
  53. "metadata": {},
  54. "outputs": [
  55. {
  56. "data": {
  57. "text/plain": [
  58. "[('coders', 0.6104331612586975),\n",
  59. " ('coder', 0.6063331365585327),\n",
  60. " ('Coding', 0.5804804563522339),\n",
  61. " ('formatting', 0.5671651363372803),\n",
  62. " ('soluble_receptors', 0.5576372146606445),\n",
  63. " ('ICD9', 0.5571348667144775),\n",
  64. " ('refactoring', 0.5495434999465942),\n",
  65. " ('database_schemas', 0.5372464656829834),\n",
  66. " ('recode', 0.534299373626709),\n",
  67. " ('XHTML_CSS', 0.5328801870346069)]"
  68. ]
  69. },
  70. "execution_count": 8,
  71. "metadata": {},
  72. "output_type": "execute_result"
  73. }
  74. ],
  75. "source": [
  76. "model.most_similar(\"coding\")"
  77. ]
  78. },
  79. {
  80. "cell_type": "code",
  81. "execution_count": 9,
  82. "metadata": {},
  83. "outputs": [
  84. {
  85. "data": {
  86. "text/plain": [
  87. "[('cats', 0.8099379539489746),\n",
  88. " ('dog', 0.7609456777572632),\n",
  89. " ('kitten', 0.7464985251426697),\n",
  90. " ('feline', 0.7326233983039856),\n",
  91. " ('beagle', 0.7150583267211914),\n",
  92. " ('puppy', 0.7075453996658325),\n",
  93. " ('pup', 0.6934291124343872),\n",
  94. " ('pet', 0.6891531348228455),\n",
  95. " ('felines', 0.6755931377410889),\n",
  96. " ('chihuahua', 0.6709762215614319)]"
  97. ]
  98. },
  99. "execution_count": 9,
  100. "metadata": {},
  101. "output_type": "execute_result"
  102. }
  103. ],
  104. "source": [
  105. "model.most_similar(\"cat\")"
  106. ]
  107. },
  108. {
  109. "cell_type": "code",
  110. "execution_count": 16,
  111. "metadata": {},
  112. "outputs": [
  113. {
  114. "name": "stdout",
  115. "output_type": "stream",
  116. "text": [
  117. "hi globe \n"
  118. ]
  119. }
  120. ],
  121. "source": [
  122. "def transformSentence(sentence):\n",
  123. " outputSentence = \"\"\n",
  124. " \n",
  125. " for word in sentence.split(\" \"):\n",
  126. " try:\n",
  127. " outputSentence += model.most_similar(word)[0][0] + \" \"\n",
  128. " except Exception:\n",
  129. " outputSentence += word + \" \"\n",
  130. " return outputSentence\n",
  131. "\n",
  132. "print(transformSentence(\"hello world\"))"
  133. ]
  134. },
  135. {
  136. "cell_type": "code",
  137. "execution_count": 12,
  138. "metadata": {},
  139. "outputs": [
  140. {
  141. "name": "stdout",
  142. "output_type": "stream",
  143. "text": [
  144. "looks Mom No hand \n"
  145. ]
  146. }
  147. ],
  148. "source": [
  149. "print(transformSentence(\"look mom no hands\"))"
  150. ]
  151. },
  152. {
  153. "cell_type": "code",
  154. "execution_count": 17,
  155. "metadata": {},
  156. "outputs": [
  157. {
  158. "name": "stdout",
  159. "output_type": "stream",
  160. "text": [
  161. "This gen_eral concept of Clustering was to groups Data wtih similiar trait \n"
  162. ]
  163. }
  164. ],
  165. "source": [
  166. "print(transformSentence(\"The general idea of clustering is to group data with similar traits\"))"
  167. ]
  168. },
  169. {
  170. "cell_type": "code",
  171. "execution_count": 52,
  172. "metadata": {},
  173. "outputs": [
  174. {
  175. "name": "stdout",
  176. "output_type": "stream",
  177. "text": [
  178. "This manager concept of clusters was to groups datasets wtih similiar traits. \n"
  179. ]
  180. }
  181. ],
  182. "source": [
  183. "def removeFromString(string, chars):\n",
  184. " for c in chars:\n",
  185. " string = string.replace(c, \"\")\n",
  186. " return string\n",
  187. "\n",
  188. "\n",
  189. "def transformSentenceWithHeuristic(sentence):\n",
  190. " outputSentence = \"\"\n",
  191. " \n",
  192. " for word in sentence.split(\" \"):\n",
  193. " try:\n",
  194. " changed = False\n",
  195. " for w, _ in model.most_similar(word):\n",
  196. " clean = removeFromString(w, [' ', '_']).lower()\n",
  197. " if clean not in word.lower() and \"_\" not in w:\n",
  198. " outputSentence += w + \" \"\n",
  199. " changed = True\n",
  200. " break\n",
  201. " outputSentence = outputSentence if changed else outputSentence + word + \" \"\n",
  202. " except Exception:\n",
  203. " outputSentence += word + \" \"\n",
  204. " return outputSentence\n",
  205. "print(transformSentenceWithHeuristic(\"The general idea of clustering is to group data with similar traits.\"))"
  206. ]
  207. },
  208. {
  209. "cell_type": "code",
  210. "execution_count": 53,
  211. "metadata": {},
  212. "outputs": [
  213. {
  214. "name": "stdout",
  215. "output_type": "stream",
  216. "text": [
  217. "Relax up and grabbing a drinks but that was day it I talking abut this hallucinogenic trips it was this 1981 film Fever Treatment. \n"
  218. ]
  219. }
  220. ],
  221. "source": [
  222. "print(transformSentenceWithHeuristic(\"Sit down and grab a drink because it is time that we talk about the LSD trip that is the 1981 movie Shock Treatment.\"))"
  223. ]
  224. },
  225. {
  226. "cell_type": "code",
  227. "execution_count": 54,
  228. "metadata": {},
  229. "outputs": [],
  230. "source": [
  231. "from sklearn.decomposition import IncrementalPCA # inital reduction\n",
  232. "from sklearn.manifold import TSNE # final reduction\n",
  233. "import numpy as np # array handling\n",
  234. "\n",
  235. "\n",
  236. "def reduce_dimensions(model):\n",
  237. " num_dimensions = 2 # final num dimensions (2D, 3D, etc)\n",
  238. "\n",
  239. " vectors = [] # positions in vector space\n",
  240. " labels = [] # keep track of words to label our data again later\n",
  241. " for word in model.wv.vocab:\n",
  242. " vectors.append(model.wv[word])\n",
  243. " labels.append(word)\n",
  244. "\n",
  245. " # convert both lists into numpy vectors for reduction\n",
  246. " vectors = np.asarray(vectors)\n",
  247. " labels = np.asarray(labels)\n",
  248. "\n",
  249. " # reduce using t-SNE\n",
  250. " vectors = np.asarray(vectors)\n",
  251. " tsne = TSNE(n_components=num_dimensions, random_state=0)\n",
  252. " vectors = tsne.fit_transform(vectors)\n",
  253. "\n",
  254. " x_vals = [v[0] for v in vectors]\n",
  255. " y_vals = [v[1] for v in vectors]\n",
  256. " return x_vals, y_vals, labels\n",
  257. "\n",
  258. "\n",
  259. "#x_vals, y_vals, labels = reduce_dimensions(model)"
  260. ]
  261. }
  262. ],
  263. "metadata": {
  264. "kernelspec": {
  265. "display_name": "Python 3",
  266. "language": "python",
  267. "name": "python3"
  268. },
  269. "language_info": {
  270. "codemirror_mode": {
  271. "name": "ipython",
  272. "version": 3
  273. },
  274. "file_extension": ".py",
  275. "mimetype": "text/x-python",
  276. "name": "python",
  277. "nbconvert_exporter": "python",
  278. "pygments_lexer": "ipython3",
  279. "version": "3.8.1"
  280. }
  281. },
  282. "nbformat": 4,
  283. "nbformat_minor": 4
  284. }