jrtechs
/
jrtechs-RandomScripts
mirror of https://github.com/jrtechs/RandomScripts.git


								{

								 "cells": [

								  {

								   "cell_type": "code",

								   "execution_count": 1,

								   "metadata": {},

								   "outputs": [],

								   "source": [

								    "import gensim\n"

								   ]

								  },

								  {
								   "cell_type": "code",
								   "execution_count": 4,
								   "metadata": {},
								   "outputs": [],
								   "source": [
								    "# Read the pretrained vectors (binary word2vec format); the path is relative to the working directory.\n",
								    "model = gensim.models.KeyedVectors.load_word2vec_format(\n",
								    "    './GoogleNews-vectors-negative300.bin',\n",
								    "    binary=True,\n",
								    ")"
								   ]
								  },

								  {

								   "cell_type": "code",

								   "execution_count": 6,

								   "metadata": {},

								   "outputs": [

								    {

								     "data": {

								      "text/plain": [

								       "[('hi', 0.654898464679718),\n",

								       " ('goodbye', 0.639905571937561),\n",

								       " ('howdy', 0.6310957074165344),\n",

								       " ('goodnight', 0.5920578241348267),\n",

								       " ('greeting', 0.5855878591537476),\n",

								       " ('Hello', 0.5842196941375732),\n",

								       " (\"g'day\", 0.5754077434539795),\n",

								       " ('See_ya', 0.5688871145248413),\n",

								       " ('ya_doin', 0.5643119812011719),\n",

								       " ('greet', 0.5636603832244873)]"

								      ]

								     },

								     "execution_count": 6,

								     "metadata": {},

								     "output_type": "execute_result"

								    }

								   ],

								   "source": [

								    "model.most_similar(\"hello\")"

								   ]

								  },

								  {

								   "cell_type": "code",

								   "execution_count": 8,

								   "metadata": {},

								   "outputs": [

								    {

								     "data": {

								      "text/plain": [

								       "[('coders', 0.6104331612586975),\n",

								       " ('coder', 0.6063331365585327),\n",

								       " ('Coding', 0.5804804563522339),\n",

								       " ('formatting', 0.5671651363372803),\n",

								       " ('soluble_receptors', 0.5576372146606445),\n",

								       " ('ICD9', 0.5571348667144775),\n",

								       " ('refactoring', 0.5495434999465942),\n",

								       " ('database_schemas', 0.5372464656829834),\n",

								       " ('recode', 0.534299373626709),\n",

								       " ('XHTML_CSS', 0.5328801870346069)]"

								      ]

								     },

								     "execution_count": 8,

								     "metadata": {},

								     "output_type": "execute_result"

								    }

								   ],

								   "source": [

								    "model.most_similar(\"coding\")"

								   ]

								  },

								  {

								   "cell_type": "code",

								   "execution_count": 9,

								   "metadata": {},

								   "outputs": [

								    {

								     "data": {

								      "text/plain": [

								       "[('cats', 0.8099379539489746),\n",

								       " ('dog', 0.7609456777572632),\n",

								       " ('kitten', 0.7464985251426697),\n",

								       " ('feline', 0.7326233983039856),\n",

								       " ('beagle', 0.7150583267211914),\n",

								       " ('puppy', 0.7075453996658325),\n",

								       " ('pup', 0.6934291124343872),\n",

								       " ('pet', 0.6891531348228455),\n",

								       " ('felines', 0.6755931377410889),\n",

								       " ('chihuahua', 0.6709762215614319)]"

								      ]

								     },

								     "execution_count": 9,

								     "metadata": {},

								     "output_type": "execute_result"

								    }

								   ],

								   "source": [

								    "model.most_similar(\"cat\")"

								   ]

								  },

								  {
								   "cell_type": "code",
								   "execution_count": 16,
								   "metadata": {},
								   "outputs": [
								    {
								     "name": "stdout",
								     "output_type": "stream",
								     "text": [
								      "hi globe \n"
								     ]
								    }
								   ],
								   "source": [
								    "def transformSentence(sentence):\n",
								    "    \"\"\"Replace each word of ``sentence`` with its nearest neighbour in ``model``.\n",
								    "\n",
								    "    The sentence is split on single spaces. Every token is swapped for the\n",
								    "    top-ranked entry returned by ``model.most_similar``; tokens for which the\n",
								    "    lookup fails are copied through unchanged. Each emitted token is followed\n",
								    "    by one space, so the returned string ends with a trailing space.\n",
								    "    \"\"\"\n",
								    "    outputSentence = \"\"\n",
								    "\n",
								    "    for word in sentence.split(\" \"):\n",
								    "        try:\n",
								    "            outputSentence += model.most_similar(word)[0][0] + \" \"\n",
								    "        except (KeyError, IndexError):\n",
								    "            # KeyError: the token is not in the vocabulary; IndexError: no\n",
								    "            # neighbours came back. Anything else (e.g. `model` not loaded)\n",
								    "            # is a real error and must surface instead of being hidden.\n",
								    "            outputSentence += word + \" \"\n",
								    "    return outputSentence\n",
								    "\n",
								    "print(transformSentence(\"hello world\"))"
								   ]
								  },

								  {

								   "cell_type": "code",

								   "execution_count": 12,

								   "metadata": {},

								   "outputs": [

								    {

								     "name": "stdout",

								     "output_type": "stream",

								     "text": [

								      "looks Mom No hand \n"

								     ]

								    }

								   ],

								   "source": [

								    "print(transformSentence(\"look mom no hands\"))"

								   ]

								  },

								  {

								   "cell_type": "code",

								   "execution_count": 17,

								   "metadata": {},

								   "outputs": [

								    {

								     "name": "stdout",

								     "output_type": "stream",

								     "text": [

								      "This gen_eral concept of Clustering was to groups Data wtih similiar trait \n"

								     ]

								    }

								   ],

								   "source": [

								    "print(transformSentence(\"The general idea of clustering is to group data with similar traits\"))"

								   ]

								  },

								  {
								   "cell_type": "code",
								   "execution_count": 52,
								   "metadata": {},
								   "outputs": [
								    {
								     "name": "stdout",
								     "output_type": "stream",
								     "text": [
								      "This manager concept of clusters was to groups datasets wtih similiar traits. \n"
								     ]
								    }
								   ],
								   "source": [
								    "def removeFromString(string, chars):\n",
								    "    \"\"\"Return ``string`` with every occurrence of each entry of ``chars`` removed.\"\"\"\n",
								    "    for c in chars:\n",
								    "        string = string.replace(c, \"\")\n",
								    "    return string\n",
								    "\n",
								    "\n",
								    "def transformSentenceWithHeuristic(sentence):\n",
								    "    \"\"\"Swap each word for its closest neighbour that is not a trivial variant.\n",
								    "\n",
								    "    For every space-separated token the candidates from ``model.most_similar``\n",
								    "    are scanned in ranked order. A candidate is rejected when it contains an\n",
								    "    underscore, or when its lower-cased form (spaces and underscores removed)\n",
								    "    occurs inside the lower-cased original token. The first accepted candidate\n",
								    "    replaces the token; when none qualifies, or the token is not in the\n",
								    "    vocabulary, the token is kept. Every emitted token is followed by a single\n",
								    "    space, so the returned string ends with a trailing space.\n",
								    "    \"\"\"\n",
								    "    outputSentence = \"\"\n",
								    "\n",
								    "    for word in sentence.split(\" \"):\n",
								    "        try:\n",
								    "            candidates = model.most_similar(word)\n",
								    "        except KeyError:\n",
								    "            # Token is not in the vocabulary: nothing to choose from, keep it.\n",
								    "            # Other errors (e.g. `model` not loaded) are left to propagate.\n",
								    "            candidates = []\n",
								    "\n",
								    "        replacement = word  # used when no candidate passes the filter\n",
								    "        for w, _ in candidates:\n",
								    "            clean = removeFromString(w, [' ', '_']).lower()\n",
								    "            if clean not in word.lower() and \"_\" not in w:\n",
								    "                replacement = w\n",
								    "                break\n",
								    "        outputSentence += replacement + \" \"\n",
								    "    return outputSentence\n",
								    "\n",
								    "\n",
								    "print(transformSentenceWithHeuristic(\"The general idea of clustering is to group data with similar traits.\"))"
								   ]
								  },

								  {

								   "cell_type": "code",

								   "execution_count": 53,

								   "metadata": {},

								   "outputs": [

								    {

								     "name": "stdout",

								     "output_type": "stream",

								     "text": [

								      "Relax up and grabbing a drinks but that was day it I talking abut this hallucinogenic trips it was this 1981 film Fever Treatment. \n"

								     ]

								    }

								   ],

								   "source": [

								    "print(transformSentenceWithHeuristic(\"Sit down and grab a drink because it is time that we talk about the LSD trip that is the 1981 movie Shock Treatment.\"))"

								   ]

								  },

								  {
								   "cell_type": "code",
								   "execution_count": 54,
								   "metadata": {},
								   "outputs": [],
								   "source": [
								    "from sklearn.decomposition import IncrementalPCA    # initial reduction\n",
								    "from sklearn.manifold import TSNE                   # final reduction\n",
								    "import numpy as np                                  # array handling\n",
								    "\n",
								    "\n",
								    "def reduce_dimensions(model):\n",
								    "    \"\"\"Project the model's word vectors to 2-D with t-SNE.\n",
								    "\n",
								    "    Accepts either an object exposing its vectors under ``.wv`` or a bare\n",
								    "    ``KeyedVectors`` store, and works with both the gensim 4 vocabulary API\n",
								    "    (``index_to_key``) and the older ``vocab`` mapping.\n",
								    "\n",
								    "    Every word in the vocabulary is embedded; no sub-sampling is done.\n",
								    "\n",
								    "    Returns ``(x_vals, y_vals, labels)``: two lists of coordinates and a\n",
								    "    numpy array holding the word that belongs to each point.\n",
								    "    \"\"\"\n",
								    "    num_dimensions = 2  # final num dimensions (2D, 3D, etc)\n",
								    "\n",
								    "    # A trained model keeps its vectors under `.wv`; KeyedVectors is the store itself.\n",
								    "    keyed_vectors = getattr(model, \"wv\", model)\n",
								    "\n",
								    "    # gensim >= 4 removed `.vocab`; fall back to it only on older releases.\n",
								    "    words = getattr(keyed_vectors, \"index_to_key\", None)\n",
								    "    if words is None:\n",
								    "        words = list(keyed_vectors.vocab)\n",
								    "\n",
								    "    # positions in vector space, plus the words to label the points again later\n",
								    "    vectors = np.asarray([keyed_vectors[word] for word in words])\n",
								    "    labels = np.asarray(words)\n",
								    "\n",
								    "    # reduce using t-SNE\n",
								    "    tsne = TSNE(n_components=num_dimensions, random_state=0)\n",
								    "    vectors = tsne.fit_transform(vectors)\n",
								    "\n",
								    "    x_vals = [v[0] for v in vectors]\n",
								    "    y_vals = [v[1] for v in vectors]\n",
								    "    return x_vals, y_vals, labels\n",
								    "\n",
								    "\n",
								    "#x_vals, y_vals, labels = reduce_dimensions(model)"
								   ]
								  }

								 ],

								 "metadata": {

								  "kernelspec": {

								   "display_name": "Python 3",

								   "language": "python",

								   "name": "python3"

								  },

								  "language_info": {

								   "codemirror_mode": {

								    "name": "ipython",

								    "version": 3

								   },

								   "file_extension": ".py",

								   "mimetype": "text/x-python",

								   "name": "python",

								   "nbconvert_exporter": "python",

								   "pygments_lexer": "ipython3",

								   "version": "3.8.1"

								  }

								 },

								 "nbformat": 4,

								 "nbformat_minor": 4

								}