From 413ac81914abccfff1acca0f56e93a5cdf3be10d Mon Sep 17 00:00:00 2001 From: Jitin Kapila Date: Sun, 2 Aug 2020 15:32:52 +0530 Subject: [PATCH 1/2] Added spacy en_core_web_sm model download in Ch03 --- ..._DocVectors_using_averaging_Via_spacy.ipynb | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/Ch3/07_DocVectors_using_averaging_Via_spacy.ipynb b/Ch3/07_DocVectors_using_averaging_Via_spacy.ipynb index 8077d36..75eb933 100644 --- a/Ch3/07_DocVectors_using_averaging_Via_spacy.ipynb +++ b/Ch3/07_DocVectors_using_averaging_Via_spacy.ipynb @@ -14,7 +14,17 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# downloading en_core_web_sm, assuming spacy is already installed\n", + "!python -m spacy download en_core_web_sm" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -371,7 +381,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", @@ -401,9 +411,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.7.3" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } From 2d7347b27ecf31adf0baac225c55911c2d0b27d7 Mon Sep 17 00:00:00 2001 From: Jitin Kapila Date: Tue, 11 Aug 2020 00:14:37 +0530 Subject: [PATCH 2/2] Solved doc2vec and Fasttext with some minor other changes --- ...ng_lemmatization_stopword_postagging.ipynb | 1024 +++++++++-------- Ch4/01_OnePipeline_ManyClassifiers.ipynb | 171 +-- Ch4/02_Doc2Vec_Example.ipynb | 819 +++++++------ Ch4/04_FastText_Example.ipynb | 910 +++++++++------ 4 files changed, 1621 insertions(+), 1303 deletions(-) diff --git a/Ch2/04_Tokenization_Stemming_lemmatization_stopword_postagging.ipynb b/Ch2/04_Tokenization_Stemming_lemmatization_stopword_postagging.ipynb index 423389c..92706a6 100644 --- 
a/Ch2/04_Tokenization_Stemming_lemmatization_stopword_postagging.ipynb +++ b/Ch2/04_Tokenization_Stemming_lemmatization_stopword_postagging.ipynb @@ -1,520 +1,562 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "QnXzcZvdc-r6" - }, - "source": [ - "In this notebook we will demostrate how to perform tokenization,stemming,lemmatization and pos_tagging using libraries like [spacy](https://spacy.io/) and [nltk](https://www.nltk.org/) " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "R3xEmJpRc5r8" - }, - "outputs": [], - "source": [ - "#This will be our corpus which we will work on\n", - "corpus_original = \"Need to finalize the demo corpus which will be used for this notebook and it should be done soon !!. It should be done by the ending of this month. But will it? This notebook has been run 4 times !!\"\n", - "corpus = \"Need to finalize the demo corpus which will be used for this notebook & should be done soon !!. It should be done by the ending of this month. But will it? This notebook has been run 4 times !!\"" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 + "name": "Tokenization_Stemming_lemmatization_stopword_postagging.ipynb", + "provenance": [], + "collapsed_sections": [] }, - "colab_type": "code", - "id": "KHh_33IopPTf", - "outputId": "37c2db65-d10d-4254-80df-48a3f48713cd" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "need to finalize the demo corpus which will be used for this notebook & should be done soon !!. it should be done by the ending of this month. but will it? 
this notebook has been run 4 times !!\n" - ] + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" } - ], - "source": [ - "#lower case the corpus\n", - "corpus = corpus.lower()\n", - "print(corpus)" - ] }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "QnXzcZvdc-r6" + }, + "source": [ + "In this notebook we will demostrate how to perform tokenization,stemming,lemmatization and pos_tagging using libraries like [spacy](https://spacy.io/) and [nltk](https://www.nltk.org/) " + ] }, - "colab_type": "code", - "id": "3yaGf8RiqgBM", - "outputId": "5856d53c-63e9-4045-dc53-96bb98fba30e" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "need to finalize the demo corpus which will be used for this notebook & should be done soon !!. it should be done by the ending of this month. but will it? this notebook has been run times !!\n" - ] - } - ], - "source": [ - "#removing digits in the corpus\n", - "import re\n", - "corpus = re.sub(r'\\d+','', corpus)\n", - "print(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "R3xEmJpRc5r8", + "colab": {} + }, + "source": [ + "#This will be our corpus which we will work on\n", + "corpus_original = \"Need to finalize the demo corpus which will be used for this notebook and it should be done soon !!. It should be done by the ending of this month. But will it? This notebook has been run 4 times !!\"\n", + "corpus = \"Need to finalize the demo corpus which will be used for this notebook & should be done soon !!. It should be done by the ending of this month. But will it? 
This notebook has been run 4 times !!\"" + ], + "execution_count": 10, + "outputs": [] }, - "colab_type": "code", - "id": "v5Q--GItqzfu", - "outputId": "e0581860-574e-4f1e-f517-29b60b438ec8" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook has been run times \n" - ] - } - ], - "source": [ - "#removing punctuations\n", - "import string\n", - "corpus = corpus.translate(str.maketrans('', '', string.punctuation))\n", - "print(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "KHh_33IopPTf", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 54 + }, + "outputId": "fa12e7e4-aeb3-4053-be10-3cadad90d094" + }, + "source": [ + "#lower case the corpus\n", + "corpus = corpus.lower()\n", + "print(corpus)" + ], + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "text": [ + "need to finalize the demo corpus which will be used for this notebook & should be done soon !!. it should be done by the ending of this month. but will it? 
this notebook has been run 4 times !!\n" + ], + "name": "stdout" + } + ] }, - "colab_type": "code", - "id": "zmANqee9rK4N", - "outputId": "227843ab-d55f-4c9a-adff-cc253eb36450" - }, - "outputs": [ { - "data": { - "text/plain": [ - "'need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook has been run times'" + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "3yaGf8RiqgBM", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 54 + }, + "outputId": "859abb8b-3a34-4e23-bd8e-963520fb6ed3" + }, + "source": [ + "#removing digits in the corpus\n", + "import re\n", + "corpus = re.sub(r'\\d+','', corpus)\n", + "print(corpus)" + ], + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "text": [ + "need to finalize the demo corpus which will be used for this notebook & should be done soon !!. it should be done by the ending of this month. but will it? 
this notebook has been run times !!\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#removing trailing whitespaces\n", - "corpus = corpus.strip()\n", - "corpus" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 872 }, - "colab_type": "code", - "id": "KMuHZTpy9X_u", - "outputId": "9ec1f466-2e24-4262-b48c-a6c36f3dcb7a" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: spacy in /home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages (2.1.3)\n", - "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages (from spacy) (2.0.3)\n", - "Requirement already satisfied: srsly<1.1.0,>=0.0.5 in /home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages (from spacy) (1.0.2)\n", - "Requirement already satisfied: plac<1.0.0,>=0.9.6 in /home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages (from spacy) (0.9.6)\n", - "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages (from spacy) (1.0.2)\n", - "Requirement already satisfied: blis<0.3.0,>=0.2.2 in /home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages (from spacy) (0.2.4)\n", - "Requirement already satisfied: wasabi<1.1.0,>=0.2.0 in /home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages (from spacy) (0.6.0)\n", - "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/etherealenvy/.local/lib/python3.6/site-packages (from spacy) (2.23.0)\n", - "Requirement already satisfied: numpy>=1.15.0 in /home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages (from spacy) (1.18.5)\n", - "Requirement already satisfied: 
jsonschema<3.0.0,>=2.6.0 in /home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages (from spacy) (2.6.0)\n", - "Requirement already satisfied: thinc<7.1.0,>=7.0.2 in /home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages (from spacy) (7.0.8)\n", - "Requirement already satisfied: preshed<2.1.0,>=2.0.1 in /home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages (from spacy) (2.0.1)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /home/etherealenvy/.local/lib/python3.6/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2020.4.5.1)\n", - "Requirement already satisfied: idna<3,>=2.5 in /home/etherealenvy/.local/lib/python3.6/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.9)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /home/etherealenvy/.local/lib/python3.6/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /home/etherealenvy/.local/lib/python3.6/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.25.9)\n", - "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in /home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages (from thinc<7.1.0,>=7.0.2->spacy) (4.46.1)\n", - "Requirement already satisfied: en_core_web_sm==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0 in /home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages (2.1.0)\n", - "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", - "You can now load the model via spacy.load('en_core_web_sm')\n", - "\u001b[38;5;2m✔ Linking successful\u001b[0m\n", - "/home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages/en_core_web_sm\n", - "-->\n", - "/home/etherealenvy/miniconda3/envs/practicalnlp/lib/python3.6/site-packages/spacy/data/en\n", - "You can now load the model via 
spacy.load('en')\n" - ] - } - ], - "source": [ - "!pip install spacy\n", - "!python -m spacy download en" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "nfJx3MnVj_ph" - }, - "source": [ - "### Tokenizing the text" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 250 + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "v5Q--GItqzfu", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "82fec440-1251-4ba1-cdf4-f1cab3c2a607" + }, + "source": [ + "#removing punctuations\n", + "import string\n", + "corpus = corpus.translate(str.maketrans('', '', string.punctuation))\n", + "print(corpus)" + ], + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "text": [ + "need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook has been run times \n" + ], + "name": "stdout" + } + ] }, - "colab_type": "code", - "id": "OUz580k2sMqf", - "outputId": "5195f746-7e75-4cfa-d763-4856359baf05" - }, - "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to\n", - "[nltk_data] /home/etherealenvy/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n", - "[nltk_data] Downloading package punkt to\n", - "[nltk_data] /home/etherealenvy/nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n" - ] + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "zmANqee9rK4N", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + }, + "outputId": "6105b616-e770-409a-88b0-3f23dd3ffd72" + }, + "source": [ + "#removing trailing whitespaces\n", + "corpus = ' '.join([token for token in corpus.split()])\n", + "corpus" + ], + "execution_count": 14, + "outputs": [ + { + 
"output_type": "execute_result", + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook has been run times'" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 14 + } + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "NLTK\n", - "Tokenized corpus: ['need', 'to', 'finalize', 'the', 'demo', 'corpus', 'which', 'will', 'be', 'used', 'for', 'this', 'notebook', 'should', 'be', 'done', 'soon', 'it', 'should', 'be', 'done', 'by', 'the', 'ending', 'of', 'this', 'month', 'but', 'will', 'it', 'this', 'notebook', 'has', 'been', 'run', 'times']\n", - "Tokenized corpus without stopwords: ['need', 'finalize', 'demo', 'corpus', 'used', 'notebook', 'done', 'soon', 'done', 'ending', 'month', 'notebook', 'run', 'times']\n", - "\n", - "Spacy:\n", - "Tokenized Corpus: ['need', 'to', 'finalize', 'the', 'demo', 'corpus', 'which', 'will', 'be', 'used', 'for', 'this', 'notebook', 'should', 'be', 'done', 'soon', 'it', 'should', 'be', 'done', 'by', 'the', 'ending', 'of', 'this', 'month', 'but', 'will', 'it', 'this', 'notebook', 'has', 'been', 'run', 'times']\n", - "Tokenized corpus without stopwords ['need', 'finalize', 'demo', 'corpus', 'notebook', 'soon', 'ending', 'month', 'notebook', 'run', 'times']\n", - "Difference between NLTK and spaCy output:\n", - " {'used', 'done'}\n" - ] - } - ], - "source": [ - "from pprint import pprint\n", - "##NLTK\n", - "import nltk\n", - "from nltk.corpus import stopwords\n", - "nltk.download('stopwords')\n", - "nltk.download('punkt')\n", - "from nltk.tokenize import word_tokenize\n", - "stop_words_nltk = set(stopwords.words('english'))\n", - "\n", - "tokenized_corpus_nltk = word_tokenize(corpus)\n", - "print(\"\\nNLTK\\nTokenized corpus:\",tokenized_corpus_nltk)\n", - 
"tokenized_corpus_without_stopwords = [i for i in tokenized_corpus_nltk if not i in stop_words_nltk]\n", - "print(\"Tokenized corpus without stopwords:\",tokenized_corpus_without_stopwords)\n", - "\n", - "\n", - "##SPACY \n", - "from spacy.lang.en.stop_words import STOP_WORDS\n", - "import spacy\n", - "spacy_model = spacy.load('en_core_web_sm')\n", - "\n", - "stopwords_spacy = spacy_model.Defaults.stop_words\n", - "print(\"\\nSpacy:\")\n", - "tokenized_corpus_spacy = word_tokenize(corpus)\n", - "print(\"Tokenized Corpus:\",text_tokens)\n", - "tokens_without_sw= [word for word in tokenized_corpus_spacy if not word in stopwords_spacy]\n", - "\n", - "print(\"Tokenized corpus without stopwords\",tokens_without_sw)\n", - "\n", - "\n", - "print(\"Difference between NLTK and spaCy output:\\n\",\n", - " set(tokenized_corpus_without_stopwords)-set(tokens_without_sw))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "eRH_ltkD-HpA" - }, - "source": [ - "Notice the difference output after stopword removal using nltk and spacy" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "tGcwD1JlkEao" - }, - "source": [ - "### Stemming" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 87 + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "KMuHZTpy9X_u", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 759 + }, + "outputId": "76adf317-3d10-46f7-f955-6edf1e9099d4" + }, + "source": [ + "!pip install spacy\n", + "!python -m spacy download en_core_web_sm" + ], + "execution_count": 15, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: spacy in /usr/local/lib/python3.6/dist-packages (2.2.4)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy) (2.0.3)\n", + "Requirement already satisfied: 
thinc==7.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (7.4.0)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (0.7.1)\n", + "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (1.18.5)\n", + "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy) (1.0.2)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (1.0.2)\n", + "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.6/dist-packages (from spacy) (1.0.0)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (2.23.0)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from spacy) (49.2.0)\n", + "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from spacy) (1.1.3)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy) (3.0.2)\n", + "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (0.4.1)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (4.41.1)\n", + "Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy) (1.7.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2020.6.20)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from 
requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.24.3)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy) (3.1.0)\n", + "Requirement already satisfied: en_core_web_sm==2.2.5 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5 in /usr/local/lib/python3.6/dist-packages (2.2.5)\n", + "Requirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.6/dist-packages (from en_core_web_sm==2.2.5) (2.2.4)\n", + "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.18.5)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.7.1)\n", + "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.2)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.2)\n", + "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.1.3)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.23.0)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.2)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (4.41.1)\n", + 
"Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (49.2.0)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.3)\n", + "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.4.1)\n", + "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (7.4.0)\n", + "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.0)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2020.6.20)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (1.24.3)\n", + "Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (1.7.0)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.1.0)\n", + "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the model via 
spacy.load('en_core_web_sm')\n" + ], + "name": "stdout" + } + ] }, - "colab_type": "code", - "id": "ibEpzcv0sdW8", - "outputId": "f40aa595-f67f-4042-e9bf-e5b4a160d524" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Before Stemming:\n", - "need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook has been run times\n", - "After Stemming:\n", - "need to final the demo corpu which will be use for thi notebook should be done soon it should be done by the end of thi month but will it thi notebook ha been run time " - ] - } - ], - "source": [ - "from nltk.stem import PorterStemmer\n", - "from nltk.tokenize import word_tokenize\n", - "stemmer= PorterStemmer()\n", - "\n", - "print(\"Before Stemming:\")\n", - "print(corpus)\n", - "\n", - "print(\"After Stemming:\")\n", - "for word in tokenized_corpus_nltk:\n", - " print(stemmer.stem(word),end=\" \")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "9Wy6cwvYkJeR" - }, - "source": [ - "### Lemmatization" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 70 + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "nfJx3MnVj_ph" + }, + "source": [ + "### Tokenizing the text" + ] }, - "colab_type": "code", - "id": "27KvL4ZE-fqJ", - "outputId": "41dc046e-14dd-4d4f-83ca-2b3af4532593" - }, - "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package wordnet to\n", - "[nltk_data] /home/etherealenvy/nltk_data...\n", - "[nltk_data] Unzipping corpora/wordnet.zip.\n" - ] + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "OUz580k2sMqf", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 272 + }, + "outputId": "da21bf1e-444b-4077-c823-e58b4986a35f" + }, + "source": [ 
+ "from pprint import pprint\n", + "##NLTK\n", + "import nltk\n", + "from nltk.corpus import stopwords\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt')\n", + "from nltk.tokenize import word_tokenize\n", + "stop_words_nltk = set(stopwords.words('english'))\n", + "\n", + "tokenized_corpus_nltk = word_tokenize(corpus)\n", + "print(\"\\nNLTK\\nTokenized corpus:\",tokenized_corpus_nltk)\n", + "tokenized_corpus_without_stopwords = [i for i in tokenized_corpus_nltk if not i in stop_words_nltk]\n", + "print(\"Tokenized corpus without stopwords:\",tokenized_corpus_without_stopwords)\n", + "\n", + "\n", + "##SPACY \n", + "from spacy.lang.en.stop_words import STOP_WORDS\n", + "import spacy\n", + "spacy_model = spacy.load('en_core_web_sm')\n", + "\n", + "stopwords_spacy = spacy_model.Defaults.stop_words\n", + "print(\"\\nSpacy:\")\n", + "tokenized_corpus_spacy = word_tokenize(corpus)\n", + "print(\"Tokenized Corpus:\",tokenized_corpus_spacy)\n", + "tokens_without_sw= [word for word in tokenized_corpus_spacy if not word in stopwords_spacy]\n", + "\n", + "print(\"Tokenized corpus without stopwords\",tokens_without_sw)\n", + "\n", + "\n", + "print(\"Difference between NLTK and spaCy output:\\n\",\n", + " set(tokenized_corpus_without_stopwords)-set(tokens_without_sw))" + ], + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n", + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "\n", + "NLTK\n", + "Tokenized corpus: ['need', 'to', 'finalize', 'the', 'demo', 'corpus', 'which', 'will', 'be', 'used', 'for', 'this', 'notebook', 'should', 'be', 'done', 'soon', 'it', 'should', 'be', 'done', 'by', 'the', 'ending', 'of', 'this', 'month', 'but', 'will', 'it', 'this', 'notebook', 'has', 'been', 'run', 'times']\n", + "Tokenized corpus without 
stopwords: ['need', 'finalize', 'demo', 'corpus', 'used', 'notebook', 'done', 'soon', 'done', 'ending', 'month', 'notebook', 'run', 'times']\n", + "\n", + "Spacy:\n", + "Tokenized Corpus: ['need', 'to', 'finalize', 'the', 'demo', 'corpus', 'which', 'will', 'be', 'used', 'for', 'this', 'notebook', 'should', 'be', 'done', 'soon', 'it', 'should', 'be', 'done', 'by', 'the', 'ending', 'of', 'this', 'month', 'but', 'will', 'it', 'this', 'notebook', 'has', 'been', 'run', 'times']\n", + "Tokenized corpus without stopwords ['need', 'finalize', 'demo', 'corpus', 'notebook', 'soon', 'ending', 'month', 'notebook', 'run', 'times']\n", + "Difference between NLTK and spaCy output:\n", + " {'done', 'used'}\n" + ], + "name": "stdout" + } + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook ha been run time " - ] - } - ], - "source": [ - "from nltk.stem import WordNetLemmatizer\n", - "from nltk.tokenize import word_tokenize\n", - "nltk.download('wordnet')\n", - "lemmatizer=WordNetLemmatizer()\n", - "\n", - "\n", - "for word in tokenized_corpus_nltk:\n", - " print(lemmatizer.lemmatize(word),end=\" \")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "h8uCGA8ukMfQ" - }, - "source": [ - "### POS Tagging" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "eRH_ltkD-HpA" + }, + "source": [ + "Notice the difference output after stopword removal using nltk and spacy" + ] }, - "colab_type": "code", - "id": "kZqBxLDz-6cu", - "outputId": "9b89af94-c433-4099-b1c3-3adefecae984" - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "POS Tagging using spacy:\n", - "Need : VERB\n", - 
"to : PART\n", - "finalize : VERB\n", - "the : DET\n", - "demo : ADJ\n", - "corpus : NOUN\n", - "which : DET\n", - "will : VERB\n", - "be : VERB\n", - "used : VERB\n", - "for : ADP\n", - "this : DET\n", - "notebook : NOUN\n", - "and : CCONJ\n", - "it : PRON\n", - "should : VERB\n", - "be : VERB\n", - "done : VERB\n", - "soon : ADV\n", - "! : PUNCT\n", - "! : PUNCT\n", - ". : PUNCT\n", - "It : PRON\n", - "should : VERB\n", - "be : VERB\n", - "done : VERB\n", - "by : ADP\n", - "the : DET\n", - "ending : NOUN\n", - "of : ADP\n", - "this : DET\n", - "month : NOUN\n", - ". : PUNCT\n", - "But : CCONJ\n", - "will : VERB\n", - "it : PRON\n", - "? : PUNCT\n", - "This : DET\n", - "notebook : NOUN\n", - "has : VERB\n", - "been : VERB\n", - "run : VERB\n", - "4 : NUM\n", - "times : NOUN\n", - "! : PUNCT\n", - "! : PUNCT\n", - "POS Tagging using NLTK:\n" - ] + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tGcwD1JlkEao" + }, + "source": [ + "### Stemming" + ] }, { - "ename": "LookupError", - "evalue": "\n**********************************************************************\n Resource \u001b[93maveraged_perceptron_tagger\u001b[0m not found.\n Please use the NLTK Downloader to obtain the resource:\n\n \u001b[31m>>> import nltk\n >>> nltk.download('averaged_perceptron_tagger')\n \u001b[0m\n For more information see: https://www.nltk.org/data.html\n\n Attempted to load \u001b[93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle\u001b[0m\n\n Searched in:\n - '/home/etherealenvy/nltk_data'\n - '/home/etherealenvy/miniconda3/envs/practicalnlp/nltk_data'\n - '/home/etherealenvy/miniconda3/envs/practicalnlp/share/nltk_data'\n - '/home/etherealenvy/miniconda3/envs/practicalnlp/lib/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/local/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/local/lib/nltk_data'\n**********************************************************************\n", - "output_type": "error", - "traceback": [ - 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mLookupError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;31m# nltk.download('averaged_perceptron_tagger')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"POS Tagging using NLTK:\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0mpprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpos_tag\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mword_tokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcorpus_original\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/.local/lib/python3.6/site-packages/nltk/tag/__init__.py\u001b[0m in \u001b[0;36mpos_tag\u001b[0;34m(tokens, tagset, lang)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0mrtype\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtuple\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 159\u001b[0m \"\"\"\n\u001b[0;32m--> 160\u001b[0;31m \u001b[0mtagger\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_tagger\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlang\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 161\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_pos_tag\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtagset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtagger\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mlang\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/lib/python3.6/site-packages/nltk/tag/__init__.py\u001b[0m in \u001b[0;36m_get_tagger\u001b[0;34m(lang)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0mtagger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0map_russian_model_loc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 106\u001b[0;31m \u001b[0mtagger\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPerceptronTagger\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 107\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mtagger\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 108\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.local/lib/python3.6/site-packages/nltk/tag/perceptron.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, load)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 167\u001b[0m AP_MODEL_LOC = \"file:\" + str(\n\u001b[0;32m--> 168\u001b[0;31m \u001b[0mfind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"taggers/averaged_perceptron_tagger/\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mPICKLE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 169\u001b[0m )\n\u001b[1;32m 170\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mAP_MODEL_LOC\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m~/.local/lib/python3.6/site-packages/nltk/data.py\u001b[0m in \u001b[0;36mfind\u001b[0;34m(resource_name, paths)\u001b[0m\n\u001b[1;32m 583\u001b[0m \u001b[0msep\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"*\"\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m70\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 584\u001b[0m \u001b[0mresource_not_found\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\\n%s\\n%s\\n%s\\n\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0msep\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 585\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mLookupError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresource_not_found\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 586\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 587\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mLookupError\u001b[0m: \n**********************************************************************\n Resource \u001b[93maveraged_perceptron_tagger\u001b[0m not found.\n Please use the NLTK Downloader to obtain the resource:\n\n \u001b[31m>>> import nltk\n >>> nltk.download('averaged_perceptron_tagger')\n \u001b[0m\n For more information see: https://www.nltk.org/data.html\n\n Attempted to load \u001b[93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle\u001b[0m\n\n Searched in:\n - '/home/etherealenvy/nltk_data'\n - '/home/etherealenvy/miniconda3/envs/practicalnlp/nltk_data'\n - '/home/etherealenvy/miniconda3/envs/practicalnlp/share/nltk_data'\n - '/home/etherealenvy/miniconda3/envs/practicalnlp/lib/nltk_data'\n - '/usr/share/nltk_data'\n - '/usr/local/share/nltk_data'\n - '/usr/lib/nltk_data'\n - '/usr/local/lib/nltk_data'\n**********************************************************************\n" - ] + 
"cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "ibEpzcv0sdW8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 84 + }, + "outputId": "18f77b85-3a8e-4e89-df28-3bd6342ac594" + }, + "source": [ + "from nltk.stem import PorterStemmer\n", + "from nltk.tokenize import word_tokenize\n", + "stemmer= PorterStemmer()\n", + "\n", + "print(\"Before Stemming:\")\n", + "print(corpus)\n", + "\n", + "print(\"After Stemming:\")\n", + "for word in tokenized_corpus_nltk:\n", + " print(stemmer.stem(word),end=\" \")" + ], + "execution_count": 18, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Before Stemming:\n", + "need to finalize the demo corpus which will be used for this notebook should be done soon it should be done by the ending of this month but will it this notebook has been run times\n", + "After Stemming:\n", + "need to final the demo corpu which will be use for thi notebook should be done soon it should be done by the end of thi month but will it thi notebook ha been run time " + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "9Wy6cwvYkJeR" + }, + "source": [ + "### Lemmatization" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "27KvL4ZE-fqJ", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 67 + }, + "outputId": "d8b6778f-79b7-4dd4-8832-da29d75dc3a8" + }, + "source": [ + "from nltk.stem import WordNetLemmatizer\n", + "from nltk.tokenize import word_tokenize\n", + "nltk.download('wordnet')\n", + "lemmatizer=WordNetLemmatizer()\n", + "\n", + "\n", + "for word in tokenized_corpus_nltk:\n", + " print(lemmatizer.lemmatize(word),end=\" \")" + ], + "execution_count": 19, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/wordnet.zip.\n", + "need to finalize the demo corpus which will be used for this 
notebook should be done soon it should be done by the ending of this month but will it this notebook ha been run time " + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "h8uCGA8ukMfQ" + }, + "source": [ + "### POS Tagging" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "kZqBxLDz-6cu", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "a8503608-0352-4c00-82fe-789d874b5655" + }, + "source": [ + "#POS tagging using spacy\n", + "doc = spacy_model(corpus_original) \n", + " \n", + "print(\"POS Tagging using spacy:\") \n", + "# Token and Tag \n", + "for token in doc: \n", + " print(token,\":\", token.pos_)\n", + "\n", + "\n", + "\n", + "#pos tagging using nltk\n", + "nltk.download('averaged_perceptron_tagger')\n", + "print(\"POS Tagging using NLTK:\")\n", + "pprint(nltk.pos_tag(word_tokenize(corpus_original)))" + ], + "execution_count": 20, + "outputs": [ + { + "output_type": "stream", + "text": [ + "POS Tagging using spacy:\n", + "Need : VERB\n", + "to : PART\n", + "finalize : VERB\n", + "the : DET\n", + "demo : NOUN\n", + "corpus : NOUN\n", + "which : DET\n", + "will : VERB\n", + "be : AUX\n", + "used : VERB\n", + "for : ADP\n", + "this : DET\n", + "notebook : NOUN\n", + "and : CCONJ\n", + "it : PRON\n", + "should : VERB\n", + "be : AUX\n", + "done : VERB\n", + "soon : ADV\n", + "! : PUNCT\n", + "! : PUNCT\n", + ". : PUNCT\n", + "It : PRON\n", + "should : VERB\n", + "be : AUX\n", + "done : VERB\n", + "by : ADP\n", + "the : DET\n", + "ending : NOUN\n", + "of : ADP\n", + "this : DET\n", + "month : NOUN\n", + ". : PUNCT\n", + "But : CCONJ\n", + "will : VERB\n", + "it : PRON\n", + "? : PUNCT\n", + "This : DET\n", + "notebook : NOUN\n", + "has : AUX\n", + "been : AUX\n", + "run : VERB\n", + "4 : NUM\n", + "times : NOUN\n", + "! : PUNCT\n", + "! 
: PUNCT\n", + "[nltk_data] Downloading package averaged_perceptron_tagger to\n", + "[nltk_data] /root/nltk_data...\n", + "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n", + "POS Tagging using NLTK:\n", + "[('Need', 'NN'),\n", + " ('to', 'TO'),\n", + " ('finalize', 'VB'),\n", + " ('the', 'DT'),\n", + " ('demo', 'NN'),\n", + " ('corpus', 'NN'),\n", + " ('which', 'WDT'),\n", + " ('will', 'MD'),\n", + " ('be', 'VB'),\n", + " ('used', 'VBN'),\n", + " ('for', 'IN'),\n", + " ('this', 'DT'),\n", + " ('notebook', 'NN'),\n", + " ('and', 'CC'),\n", + " ('it', 'PRP'),\n", + " ('should', 'MD'),\n", + " ('be', 'VB'),\n", + " ('done', 'VBN'),\n", + " ('soon', 'RB'),\n", + " ('!', '.'),\n", + " ('!', '.'),\n", + " ('.', '.'),\n", + " ('It', 'PRP'),\n", + " ('should', 'MD'),\n", + " ('be', 'VB'),\n", + " ('done', 'VBN'),\n", + " ('by', 'IN'),\n", + " ('the', 'DT'),\n", + " ('ending', 'VBG'),\n", + " ('of', 'IN'),\n", + " ('this', 'DT'),\n", + " ('month', 'NN'),\n", + " ('.', '.'),\n", + " ('But', 'CC'),\n", + " ('will', 'MD'),\n", + " ('it', 'PRP'),\n", + " ('?', '.'),\n", + " ('This', 'DT'),\n", + " ('notebook', 'NN'),\n", + " ('has', 'VBZ'),\n", + " ('been', 'VBN'),\n", + " ('run', 'VBN'),\n", + " ('4', 'CD'),\n", + " ('times', 'NNS'),\n", + " ('!', '.'),\n", + " ('!', '.')]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "zWdmz6lFkpEI" + }, + "source": [ + "There are various other libraries you can use to perform these common pre-processing steps" + ] } - ], - "source": [ - "#POS tagging using spacy\n", - "doc = spacy_model(corpus_original) \n", - " \n", - "print(\"POS Tagging using spacy:\") \n", - "# Token and Tag \n", - "for token in doc: \n", - " print(token,\":\", token.pos_)\n", - "\n", - "\n", - "\n", - "#pos tagging using nltk\n", - "# nltk.download('averaged_perceptron_tagger')\n", - "print(\"POS Tagging using NLTK:\")\n", - "pprint(nltk.pos_tag(word_tokenize(corpus_original)))" - ] 
- }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "zWdmz6lFkpEI" - }, - "source": [ - "There are various other libraries you can use to perform these common pre-processing steps" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "Tokenization_Stemming_lemmatization_stopword_postagging.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} + ] +} \ No newline at end of file diff --git a/Ch4/01_OnePipeline_ManyClassifiers.ipynb b/Ch4/01_OnePipeline_ManyClassifiers.ipynb index 7425835..44b5c37 100644 --- a/Ch4/01_OnePipeline_ManyClassifiers.ipynb +++ b/Ch4/01_OnePipeline_ManyClassifiers.ipynb @@ -54,7 +54,11 @@ "metadata": { "id": "QBvvarqE5xWm", "colab_type": "code", - "colab": {} + "colab": { + "base_uri": "https://localhost:8080/", + "height": 70 + }, + "outputId": "aa45dbb3-dde0-436a-ab7b-cb6ea9bcbcf1" }, "source": [ "import numpy as np\n", @@ -87,8 +91,17 @@ "#import time function from time module to track the training duration\n", "from time import time" ], - "execution_count": 0, - "outputs": [] + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.feature_extraction.stop_words module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.feature_extraction.text. 
Anything that cannot be imported from sklearn.feature_extraction.text is now part of the private API.\n", + " warnings.warn(message, FutureWarning)\n" + ], + "name": "stderr" + } + ] }, { "cell_type": "markdown", @@ -105,37 +118,36 @@ "metadata": { "id": "tZkcGcXp9wq1", "colab_type": "code", - "outputId": "4943ab36-3441-4655-82b0-61b2c4cec376", "colab": { "base_uri": "https://localhost:8080/", - "height": 106 - } + "height": 269 + }, + "outputId": "ea1bb924-36da-4a12-e3c9-30d1808f4660" }, "source": [ - "!apt-get install -y -qq software-properties-common python3-software-properties module-init-tools\n", - "!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null\n", - "!apt-get update -qq 2>&1 > /dev/null\n", - "!apt-get -y install -qq google-drive-ocamlfuse fuse\n", - "from google.colab import auth\n", - "auth.authenticate_user()\n", - "from oauth2client.client import GoogleCredentials\n", - "creds = GoogleCredentials.get_application_default()\n", - "import getpass\n", - "!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL\n", - "vcode = getpass.getpass()\n", - "!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}\n", - "!mkdir -p drive\n", - "!google-drive-ocamlfuse drive" + "!wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv\n", + "!ls -lah DATAPATH" ], - "execution_count": 0, + "execution_count": 2, "outputs": [ { "output_type": "stream", "text": [ - "Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force\n", - "··········\n", - "Please, open the following URL in a web browser: 
https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force\n", - "Please enter the verification code: Access token retrieved correctly.\n" + "--2020-08-10 11:32:30-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Full-Economic-News-DFE-839861.csv\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 12383529 (12M) [text/plain]\n", + "Saving to: ‘DATAPATH/Full-Economic-News-DFE-839861.csv’\n", + "\n", + "Full-Economic-News- 100%[===================>] 11.81M 35.5MB/s in 0.3s \n", + "\n", + "2020-08-10 11:32:31 (35.5 MB/s) - ‘DATAPATH/Full-Economic-News-DFE-839861.csv’ saved [12383529/12383529]\n", + "\n", + "total 12M\n", + "drwxr-xr-x 2 root root 4.0K Aug 10 11:32 .\n", + "drwxr-xr-x 1 root root 4.0K Aug 10 11:32 ..\n", + "-rw-r--r-- 1 root root 12M Aug 10 11:32 Full-Economic-News-DFE-839861.csv\n" ], "name": "stdout" } @@ -146,19 +158,19 @@ "metadata": { "id": "LbED8Q185xWu", "colab_type": "code", - "outputId": "f48a6b5a-c95c-4134-927d-768b86d4b746", "colab": { "base_uri": "https://localhost:8080/", - "height": 104 - } + "height": 101 + }, + "outputId": "6bc758e1-c542-4309-a8b7-f9ae406bde7d" }, "source": [ - "our_data = pd.read_csv(\"DATAPATH\" , encoding = \"ISO-8859-1\" )\n", + "our_data = pd.read_csv(\"DATAPATH/Full-Economic-News-DFE-839861.csv\" , encoding = \"ISO-8859-1\" )\n", "\n", "display(our_data.shape) #Number of rows (instances) and columns in the dataset\n", "our_data[\"relevance\"].value_counts()/our_data.shape[0] #Class distribution in the dataset" ], - "execution_count": 0, + 
"execution_count": 3, "outputs": [ { "output_type": "display_data", @@ -184,7 +196,7 @@ "metadata": { "tags": [] }, - "execution_count": 9 + "execution_count": 3 } ] }, @@ -203,11 +215,11 @@ "metadata": { "id": "BYW_S3585xXF", "colab_type": "code", - "outputId": "233a6037-4a02-466e-bb59-21fb6ce9182c", "colab": { "base_uri": "https://localhost:8080/", "height": 34 - } + }, + "outputId": "988c2d6e-9826-4651-d6b6-2464d2479062" }, "source": [ "# convert label to a numerical variable\n", @@ -217,7 +229,7 @@ "our_data = our_data[[\"text\",\"relevance\"]] #Let us take only the two columns we need.\n", "our_data.shape" ], - "execution_count": 0, + "execution_count": 4, "outputs": [ { "output_type": "execute_result", @@ -229,7 +241,7 @@ "metadata": { "tags": [] }, - "execution_count": 10 + "execution_count": 4 } ] }, @@ -269,7 +281,7 @@ " #remove punctuation and numbers\n", " return doc" ], - "execution_count": 0, + "execution_count": 5, "outputs": [] }, { @@ -295,11 +307,11 @@ "metadata": { "id": "GimJJHhg5xYl", "colab_type": "code", - "outputId": "4dbac8e7-df8f-4374-b9b1-437149a6fb19", "colab": { "base_uri": "https://localhost:8080/", - "height": 69 - } + "height": 67 + }, + "outputId": "e8d8dbea-9a85-4ca9-c41a-6cdd092b68df" }, "source": [ "import sklearn\n", @@ -316,7 +328,7 @@ "print(X_train.shape, y_train.shape)\n", "print(X_test.shape, y_test.shape)" ], - "execution_count": 0, + "execution_count": 6, "outputs": [ { "output_type": "stream", @@ -334,11 +346,11 @@ "metadata": { "id": "gsUyIBUD5xZI", "colab_type": "code", - "outputId": "26cd24c2-84ab-4e02-e873-1544254e8ce9", "colab": { "base_uri": "https://localhost:8080/", "height": 34 - } + }, + "outputId": "f704fb2b-d1f1-4c3a-d220-f05ee1384a61" }, "source": [ "#Step 2-3: Preprocess and Vectorize train and test data\n", @@ -349,7 +361,7 @@ "print(X_train_dtm.shape, X_test_dtm.shape)\n", "#i.e., the dimension of our feature vector is 49753!" 
], - "execution_count": 0, + "execution_count": 7, "outputs": [ { "output_type": "stream", @@ -365,11 +377,11 @@ "metadata": { "id": "nDLwA4CL5xZq", "colab_type": "code", - "outputId": "1340afd9-2e5d-43de-9e46-abc50f8dd9e3", "colab": { "base_uri": "https://localhost:8080/", - "height": 52 - } + "height": 50 + }, + "outputId": "d1fea858-5e17-4d3e-9450-ae5702114261" }, "source": [ "#Step 3: Train the classifier and predict for test data\n", @@ -377,13 +389,13 @@ "%time nb.fit(X_train_dtm, y_train)#train the model(timing it with an IPython \"magic command\")\n", "y_pred_class = nb.predict(X_test_dtm)#make class predictions for X_test_dtm" ], - "execution_count": 0, + "execution_count": 8, "outputs": [ { "output_type": "stream", "text": [ - "CPU times: user 12.6 ms, sys: 0 ns, total: 12.6 ms\n", - "Wall time: 13.8 ms\n" + "CPU times: user 11.9 ms, sys: 1.08 ms, total: 13 ms\n", + "Wall time: 14.9 ms\n" ], "name": "stdout" } @@ -394,11 +406,11 @@ "metadata": { "id": "LiCHjvc75xZ3", "colab_type": "code", - "outputId": "dbaf55e1-faea-44d5-c2c7-e24bdd7203f7", "colab": { "base_uri": "https://localhost:8080/", - "height": 495 - } + "height": 494 + }, + "outputId": "5308276c-78cd-48c3-bf5c-1f6eb5321152" }, "source": [ "#Step 4: Evaluate the classifier using various measures\n", @@ -452,7 +464,7 @@ "y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]\n", "print(\"ROC_AOC_Score: \", metrics.roc_auc_score(y_test, y_pred_prob))" ], - "execution_count": 0, + "execution_count": 9, "outputs": [ { "output_type": "stream", @@ -465,7 +477,7 @@ { "output_type": "display_data", "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -492,11 +504,11 @@ "metadata": { "id": "ylOI4OsD5xaE", "colab_type": "code", - "outputId": "38070c4c-90a7-474f-958e-b672d2318c2d", "colab": { "base_uri": "https://localhost:8080/", - "height": 512 - } + "height": 510 + }, + "outputId": "1db10ac7-a249-474a-b9f7-7d4e57af93c5" }, "source": [ "vect = CountVectorizer(preprocessor=clean, max_features=5000) #Step-1\n", @@ -512,13 +524,13 @@ "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", " title='Confusion matrix with max 5000 features')" ], - "execution_count": 0, + "execution_count": 10, "outputs": [ { "output_type": "stream", "text": [ - "CPU times: user 5.24 ms, sys: 0 ns, total: 5.24 ms\n", - "Wall time: 5.25 ms\n", + "CPU times: user 5.32 ms, sys: 0 ns, total: 5.32 ms\n", + "Wall time: 5.33 ms\n", "Accuracy: 0.6876876876876877\n" ], "name": "stdout" @@ -526,7 +538,7 @@ { "output_type": "display_data", "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -553,11 +565,11 @@ "metadata": { "id": "0v7pM9hB5xbA", "colab_type": "code", - "outputId": "80d7503c-9d10-45a2-ff46-705ded6530a5", "colab": { "base_uri": "https://localhost:8080/", - "height": 634 - } + "height": 628 + }, + "outputId": "efcfc795-ad19-44fe-e557-67dc8301363c" }, "source": [ "from sklearn.linear_model import LogisticRegression #import\n", @@ -576,7 +588,7 @@ "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", " title='Confusion matrix with normalization')" ], - "execution_count": 0, + "execution_count": 11, "outputs": [ { "output_type": "stream", @@ -603,7 +615,7 @@ { "output_type": "display_data", "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -630,11 +642,11 @@ "metadata": { "id": "XJLKusAQ5xbf", "colab_type": "code", - "outputId": "bffaee1e-548d-4997-d003-f3abd1d306e9", "colab": { "base_uri": "https://localhost:8080/", - "height": 529 - } + "height": 527 + }, + "outputId": "2d8342ee-cf45-45d5-8a08-cf996b6fbdd8" }, "source": [ "from sklearn.svm import LinearSVC\n", @@ -657,7 +669,7 @@ "plot_confusion_matrix(cnf_matrix, classes=['Not Relevant','Relevant'],normalize=True,\n", " title='Confusion matrix with normalization')" ], - "execution_count": 0, + "execution_count": 12, "outputs": [ { "output_type": "stream", @@ -670,7 +682,7 @@ { "output_type": "stream", "text": [ - "Accuracy: 0.6871871871871872\n", + "Accuracy: 0.6861861861861862\n", "AUC: 0.7251117679464362\n" ], "name": "stdout" @@ -678,7 +690,7 @@ { "output_type": "display_data", "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -699,6 +711,19 @@ "source": [ "So, how do we choose whats the best? If we look at overall accuracy alone, we should be choosing the very first classifier in this notebook. However, that is also doing poorly with identifying \"relevant\" articles. If we choose purely based on how good it is doing with \"relevant\" category, we should choose the second one we built. If we choose purely based on how good it is doing with \"irrelevant\" category, surely, nothing beats not building any classifier and just calling everything irrelevant! So, what to choose as the best among these depends on what we are looking for in our usecase! " ] + }, + { + "cell_type": "code", + "metadata": { + "id": "AcZ_zv34JUQt", + "colab_type": "code", + "colab": {} + }, + "source": [ + "" + ], + "execution_count": 12, + "outputs": [] } ] -} +} \ No newline at end of file diff --git a/Ch4/02_Doc2Vec_Example.ipynb b/Ch4/02_Doc2Vec_Example.ipynb index 546569b..ea63f5b 100644 --- a/Ch4/02_Doc2Vec_Example.ipynb +++ b/Ch4/02_Doc2Vec_Example.ipynb @@ -1,375 +1,474 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "LCgVnQopb6TI" - }, - "source": [ - "# Doc2Vec demonstration \n", - "\n", - "In this notebook, let us take a look at how to \"learn\" document embeddings and use them for text classification. We will be using the dataset of \"Sentiment and Emotion in Text\" from [Kaggle](https://www.kaggle.com/c/sa-emotions/data).\n", - "\n", - "\"In a variation on the popular task of sentiment analysis, this dataset contains labels for the emotional content (such as happiness, sadness, and anger) of texts. Hundreds to thousands of examples across 13 labels. 
A subset of this data is used in an experiment we uploaded to Microsoft’s Cortana Intelligence Gallery.\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "hSB6W1seb6TJ" - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from nltk.tokenize import TweetTokenizer\n", - "from nltk.corpus import stopwords\n", - "from sklearn.model_selection import train_test_split\n", - "from gensim.models.doc2vec import Doc2Vec, TaggedDocument" - ] + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Doc2Vec_Example.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "lSvnHBYPb6TQ", - "outputId": "76752204-4f72-4ab3-f783-44b1d854173d" - }, - "outputs": [ + "cells": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "(40000, 4)\n" - ] + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "LCgVnQopb6TI" + }, + "source": [ + "# Doc2Vec demonstration \n", + "\n", + "In this notebook, let us take a look at how to \"learn\" document embeddings and use them for text classification. We will be using the dataset of \"Sentiment and Emotion in Text\" from [Kaggle](https://www.kaggle.com/c/sa-emotions/data).\n", + "\n", + "\"In a variation on the popular task of sentiment analysis, this dataset contains labels for the emotional content (such as happiness, sadness, and anger) of texts. Hundreds to thousands of examples across 13 labels. 
A subset of this data is used in an experiment we uploaded to Microsoft’s Cortana Intelligence Gallery.\"\n" + ] }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tweet_idsentimentauthorcontent
01956967341emptyxoshayzers@tiffanylue i know i was listenin to bad habi...
11956967666sadnesswannamamaLayin n bed with a headache ughhhh...waitin o...
21956967696sadnesscoolfunkyFuneral ceremony...gloomy friday...
31956967789enthusiasmczareaquinowants to hang out with friends SOON!
41956968416neutralxkilljoyx@dannycastillo We want to trade with someone w...
\n", - "
" + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "hSB6W1seb6TJ", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 50 + }, + "outputId": "fa6730c6-06df-46ce-91c8-cd59211d24de" + }, + "source": [ + "import pandas as pd\n", + "import nltk\n", + "nltk.download('stopwords')\n", + "from nltk.tokenize import TweetTokenizer\n", + "from nltk.corpus import stopwords\n", + "from sklearn.model_selection import train_test_split\n", + "from gensim.models.doc2vec import Doc2Vec, TaggedDocument" ], - "text/plain": [ - " tweet_id sentiment author \\\n", - "0 1956967341 empty xoshayzers \n", - "1 1956967666 sadness wannamama \n", - "2 1956967696 sadness coolfunky \n", - "3 1956967789 enthusiasm czareaquino \n", - "4 1956968416 neutral xkilljoyx \n", - "\n", - " content \n", - "0 @tiffanylue i know i was listenin to bad habi... \n", - "1 Layin n bed with a headache ughhhh...waitin o... \n", - "2 Funeral ceremony...gloomy friday... \n", - "3 wants to hang out with friends SOON! \n", - "4 @dannycastillo We want to trade with someone w... 
" + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 176, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "#Load the dataset and explore.\n", - "filepath = \"DATASET_PATH\"\n", - "df = pd.read_csv(filepath)\n", - "print(df.shape)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "5JEI6SH7b6TU", - "outputId": "1518cce3-afd2-48e2-ac51-c2c314d7329a" - }, - "outputs": [ + }, { - "data": { - "text/plain": [ - "neutral 8638\n", - "worry 8459\n", - "happiness 5209\n", - "sadness 5165\n", - "love 3842\n", - "surprise 2187\n", - "fun 1776\n", - "relief 1526\n", - "hate 1323\n", - "empty 827\n", - "enthusiasm 759\n", - "boredom 179\n", - "anger 110\n", - "Name: sentiment, dtype: int64" + "cell_type": "code", + "metadata": { + "id": "Fnv168QRJjxi", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 470 + }, + "outputId": "a99d7d74-d248-4fa0-bf66-916283ffd401" + }, + "source": [ + "#downloding data\n", + "!wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv\n", + "!wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv\n", + "!ls -lah DATAPATH" + ], + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "text": [ + "--2020-08-10 11:36:06-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 2479133 (2.4M) [text/plain]\n", + "Saving to: ‘DATAPATH/train_data.csv’\n", + "\n", + "\rtrain_data.csv 0%[ ] 0 --.-KB/s \rtrain_data.csv 100%[===================>] 2.36M 14.7MB/s in 0.2s \n", + "\n", + "2020-08-10 11:36:07 (14.7 MB/s) - ‘DATAPATH/train_data.csv’ saved [2479133/2479133]\n", + "\n", + "--2020-08-10 11:36:08-- https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 783640 (765K) [text/plain]\n", + "Saving to: ‘DATAPATH/test_data.csv’\n", + "\n", + "test_data.csv 100%[===================>] 765.27K --.-KB/s in 0.1s \n", + "\n", + "2020-08-10 11:36:09 (6.78 MB/s) - ‘DATAPATH/test_data.csv’ saved [783640/783640]\n", + "\n", + "total 3.2M\n", + "drwxr-xr-x 2 root root 4.0K Aug 10 11:36 .\n", + "drwxr-xr-x 1 root root 4.0K Aug 10 11:36 ..\n", + "-rw-r--r-- 1 root root 766K Aug 10 11:36 test_data.csv\n", + "-rw-r--r-- 1 root root 2.4M Aug 10 11:36 train_data.csv\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 177, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "df['sentiment'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "CHajyKpmb6TY", - "outputId": "46661693-ae70-46cc-8a40-ab8196b5069b" - }, - "outputs": [ + }, { - "data": { - "text/plain": [ - "(22306, 4)" + "cell_type": "code", + "metadata": { + "colab_type": 
"code", + "id": "lSvnHBYPb6TQ", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 212 + }, + "outputId": "e2aac8d5-ef66-4e02-9949-32434f8cb537" + }, + "source": [ + "#Load the dataset and explore.\n", + "filepath = \"DATAPATH/train_data.csv\"\n", + "df = pd.read_csv(filepath)\n", + "print(df.shape)\n", + "df.head()" + ], + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "text": [ + "(30000, 2)\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sentimentcontent
0empty@tiffanylue i know i was listenin to bad habi...
1sadnessLayin n bed with a headache ughhhh...waitin o...
2sadnessFuneral ceremony...gloomy friday...
3enthusiasmwants to hang out with friends SOON!
4neutral@dannycastillo We want to trade with someone w...
\n", + "
" + ], + "text/plain": [ + " sentiment content\n", + "0 empty @tiffanylue i know i was listenin to bad habi...\n", + "1 sadness Layin n bed with a headache ughhhh...waitin o...\n", + "2 sadness Funeral ceremony...gloomy friday...\n", + "3 enthusiasm wants to hang out with friends SOON!\n", + "4 neutral @dannycastillo We want to trade with someone w..." + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 2 + } ] - }, - "execution_count": 178, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "#Let us take the top 3 categories and leave out the rest.\n", - "shortlist = ['neutral', \"happiness\", \"worry\"]\n", - "df_subset = df[df['sentiment'].isin(shortlist)]\n", - "df_subset.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "m2oiZzU5b6Tf" - }, - "source": [ - "# Text pre-processing:\n", - "Tweets are different. Somethings to consider:\n", - "- Removing @mentions, and urls perhaps?\n", - "- using NLTK Tweet tokenizer instead of a regular one\n", - "- stopwords, numbers as usual." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "Rl-FfMdLb6Th", - "outputId": "1688361f-c82f-4b0f-9a49-fa6e415f2095" - }, - "outputs": [ + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "22306 22306\n" - ] - } - ], - "source": [ - "#strip_handles removes personal information such as twitter handles, which don't\n", - "#contribute to emotion in the tweet. preserve_case=False converts everything to lowercase.\n", - "tweeter = TweetTokenizer(strip_handles=True,preserve_case=False)\n", - "mystopwords = set(stopwords.words(\"english\"))\n", - "\n", - "#Function to tokenize tweets, remove stopwords and numbers. 
\n", - "#Keeping punctuations and emoticon symbols could be relevant for this task!\n", - "def preprocess_corpus(texts):\n", - " def remove_stops_digits(tokens):\n", - " #Nested function that removes stopwords and digits from a list of tokens\n", - " return [token for token in tokens if token not in mystopwords and not token.isdigit()]\n", - " #This return statement below uses the above function to process twitter tokenizer output further. \n", - " return [remove_stops_digits(tweeter.tokenize(content)) for content in texts]\n", - "\n", - "#df_subset contains only the three categories we chose. \n", - "mydata = preprocess_corpus(df_subset['content'])\n", - "mycats = df_subset['sentiment']\n", - "print(len(mydata), len(mycats))" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "rsGwfVebb6Tl", - "outputId": "a39c6870-ef5b-45f4-b574-d94862ae1922" - }, - "outputs": [ + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "5JEI6SH7b6TU", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 252 + }, + "outputId": "22cc98a5-90d0-49c9-fb58-f40d743963e9" + }, + "source": [ + "df['sentiment'].value_counts()" + ], + "execution_count": 3, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "worry 7433\n", + "neutral 6340\n", + "sadness 4828\n", + "happiness 2986\n", + "love 2068\n", + "surprise 1613\n", + "hate 1187\n", + "fun 1088\n", + "relief 1021\n", + "empty 659\n", + "enthusiasm 522\n", + "boredom 157\n", + "anger 98\n", + "Name: sentiment, dtype: int64" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 3 + } + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model Saved\n" - ] - } - ], - "source": [ - "#Split data into train and test, following the usual process\n", - "train_data, test_data, train_cats, test_cats = train_test_split(mydata,mycats,random_state=1234)\n", - "\n", - "#prepare training data 
in doc2vec format:\n", - "train_doc2vec = [TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(train_data)]\n", - "#Train a doc2vec model to learn tweet representations. Use only training data!!\n", - "model = Doc2Vec(vector_size=50, alpha=0.025, min_count=10, dm =1, epochs=100)\n", - "model.build_vocab(train_doc2vec)\n", - "model.train(train_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)\n", - "model.save(\"d2v.model\")\n", - "print(\"Model Saved\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "hTqo26Vsb6Ts", - "outputId": "1fadf3cc-1238-4512-9d50-2b1402f0fe8c" - }, - "outputs": [ + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "CHajyKpmb6TY", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "8749211d-1a7c-43c9-bf40-d4d22e74407a" + }, + "source": [ + "#Let us take the top 3 categories and leave out the rest.\n", + "shortlist = ['neutral', \"happiness\", \"worry\"]\n", + "df_subset = df[df['sentiment'].isin(shortlist)]\n", + "df_subset.shape" + ], + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(16759, 2)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 4 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "m2oiZzU5b6Tf" + }, + "source": [ + "# Text pre-processing:\n", + "Tweets are different. Somethings to consider:\n", + "- Removing @mentions, and urls perhaps?\n", + "- using NLTK Tweet tokenizer instead of a regular one\n", + "- stopwords, numbers as usual." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "Rl-FfMdLb6Th", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "6273576c-0495-4606-da02-a7d06358ab2d" + }, + "source": [ + "#strip_handles removes personal information such as twitter handles, which don't\n", + "#contribute to emotion in the tweet. preserve_case=False converts everything to lowercase.\n", + "tweeter = TweetTokenizer(strip_handles=True,preserve_case=False)\n", + "mystopwords = set(stopwords.words(\"english\"))\n", + "\n", + "#Function to tokenize tweets, remove stopwords and numbers. \n", + "#Keeping punctuations and emoticon symbols could be relevant for this task!\n", + "def preprocess_corpus(texts):\n", + " def remove_stops_digits(tokens):\n", + " #Nested function that removes stopwords and digits from a list of tokens\n", + " return [token for token in tokens if token not in mystopwords and not token.isdigit()]\n", + " #This return statement below uses the above function to process twitter tokenizer output further. \n", + " return [remove_stops_digits(tweeter.tokenize(content)) for content in texts]\n", + "\n", + "#df_subset contains only the three categories we chose. 
\n", + "mydata = preprocess_corpus(df_subset['content'])\n", + "mycats = df_subset['sentiment']\n", + "print(len(mydata), len(mycats))" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "text": [ + "16759 16759\n" + ], + "name": "stdout" + } + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " happiness 0.51 0.47 0.49 1331\n", - " neutral 0.47 0.58 0.52 2143\n", - " worry 0.55 0.44 0.49 2103\n", - "\n", - "avg / total 0.51 0.50 0.50 5577\n", - "\n" - ] + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "rsGwfVebb6Tl", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 87 + }, + "outputId": "ce668f41-578a-467e-d3e7-20ea66b53c6d" + }, + "source": [ + "#Split data into train and test, following the usual process\n", + "train_data, test_data, train_cats, test_cats = train_test_split(mydata,mycats,random_state=1234)\n", + "\n", + "#prepare training data in doc2vec format:\n", + "train_doc2vec = [TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(train_data)]\n", + "#Train a doc2vec model to learn tweet representations. Use only training data!!\n", + "model = Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dm =1, epochs=100)\n", + "model.build_vocab(train_doc2vec)\n", + "model.train(train_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)\n", + "model.save(\"d2v.model\")\n", + "print(\"Model Saved\")" + ], + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Model Saved\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:254: UserWarning: This function is deprecated, use smart_open.open instead. 
See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n", + " 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "hTqo26Vsb6Ts", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 238 + }, + "outputId": "13f5218a-a22d-400d-bd9e-d53a51c767d7" + }, + "source": [ + "#Infer the feature representation for training and test data using the trained model\n", + "model= Doc2Vec.load(\"d2v.model\")\n", + "#infer in multiple steps to get a stable representation. \n", + "train_vectors = [model.infer_vector(list_of_tokens, steps=50) for list_of_tokens in train_data]\n", + "test_vectors = [model.infer_vector(list_of_tokens, steps=50) for list_of_tokens in test_data]\n", + "\n", + "#Use any regular classifier like logistic regression\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "myclass = LogisticRegression(class_weight=\"balanced\") #because classes are not balanced. \n", + "myclass.fit(train_vectors, train_cats)\n", + "\n", + "preds = myclass.predict(test_vectors)\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "print(classification_report(test_cats, preds))\n", + "\n", + "#print(confusion_matrix(test_cats,preds))\n" + ], + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:254: UserWarning: This function is deprecated, use smart_open.open instead. 
See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n", + " 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " happiness 0.35 0.53 0.42 713\n", + " neutral 0.48 0.55 0.51 1595\n", + " worry 0.61 0.40 0.48 1882\n", + "\n", + " accuracy 0.48 4190\n", + " macro avg 0.48 0.50 0.47 4190\n", + "weighted avg 0.51 0.48 0.48 4190\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "FQuoinmiK_jG", + "colab_type": "code", + "colab": {} + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] } - ], - "source": [ - "#Infer the feature representation for training and test data using the trained model\n", - "model= Doc2Vec.load(\"d2v.model\")\n", - "#infer in multiple steps to get a stable representation. \n", - "train_vectors = [model.infer_vector(list_of_tokens, steps=50) for list_of_tokens in train_data]\n", - "test_vectors = [model.infer_vector(list_of_tokens, steps=50) for list_of_tokens in test_data]\n", - "\n", - "#Use any regular classifier like logistic regression\n", - "from sklearn.linear_model import LogisticRegression\n", - "\n", - "myclass = LogisticRegression(class_weight=\"balanced\") #because classes are not balanced. 
\n", - "myclass.fit(train_vectors, train_cats)\n", - "\n", - "preds = myclass.predict(test_vectors)\n", - "from sklearn.metrics import classification_report, confusion_matrix\n", - "print(classification_report(test_cats, preds))\n", - "\n", - "#print(confusion_matrix(test_cats,preds))\n" - ] - } - ], - "metadata": { - "colab": { - "name": "Doc2Vec_Example.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.10" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} + ] +} \ No newline at end of file diff --git a/Ch4/04_FastText_Example.ipynb b/Ch4/04_FastText_Example.ipynb index b88d33f..8a98d80 100644 --- a/Ch4/04_FastText_Example.ipynb +++ b/Ch4/04_FastText_Example.ipynb @@ -1,390 +1,542 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "6FIToZHAhz2O" - }, - "source": [ - "In this notebook we will demonstrate using the fastText library to perform text classificatoin on the dbpedie data which can we downloaded from [here](https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz).
fastText is a library for learning of word embeddings and text classification created by Facebook's AI Research (FAIR) lab. The model allows to create an unsupervised learning or supervised learning algorithm for obtaining vector representations for words. Facebook makes available pretrained models for 294 languages(source: [wiki](https://en.wikipedia.org/wiki/FastText)).
\n", - "**Note**: This notebook uses an older version of fasttext." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "YKgZXvTGb61z" - }, - "outputs": [], - "source": [ - "#necessary imports\n", - "import pandas as pd\n", - "from time import time\n", - "from fasttext import supervised " - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "lMoRw3oQb62I", - "outputId": "370aaaab-6613-4631-bb88-e00d746a5100" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train:(560000, 3) Test:(70000, 3)\n" - ] + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "FastText_Example.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" } - ], - "source": [ - "data_path = 'DATAFOLDER_PATH'\n", - "# Loading train data\n", - "train_file = data_path + 'dbpedia_csv/train.csv'\n", - "df = pd.read_csv(train_file, header=None, names=['class','name','description'])\n", - "# Loading test data\n", - "test_file = data_path + 'dbpedia_csv/test.csv'\n", - "df_test = pd.read_csv(test_file, header=None, names=['class','name','description'])\n", - "# Data we have\n", - "print(\"Train:{} Test:{}\".format(df.shape,df_test.shape))\n" - ] }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "gaz226vXb62W", - "outputId": "cbe39521-6bd0-4d5f-ddf1-bae279784d16" - }, - "outputs": [ + "cells": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
classnamedescriptionclass_name
01E. D. Abbott LtdAbbott of Farnham E D Abbott Limited was a Br...Company
11Schwan-StabiloSchwan-STABILO is a German maker of pens for ...Company
21Q-workshopQ-workshop is a Polish company located in Poz...Company
31Marvell Software Solutions IsraelMarvell Software Solutions Israel known as RA...Company
41Bergan Mercy Medical CenterBergan Mercy Medical Center is a hospital loc...Company
\n", - "
" + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "6FIToZHAhz2O" + }, + "source": [ + "In this notebook we will demonstrate using the fastText library to perform text classificatoin on the dbpedie data which can we downloaded from [here](https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz).
fastText is a library for learning word embeddings and performing text classification, created by Facebook's AI Research (FAIR) lab. The model allows one to create an unsupervised or supervised learning algorithm for obtaining vector representations for words. Facebook makes pretrained models available for 294 languages (source: [wiki](https://en.wikipedia.org/wiki/FastText)).
\n", + "**Note**: This notebook uses an older version of fasttext." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xC9f1uA-OX8J", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "outputId": "02dfd2ba-06ee-403a-dde6-a2b3ae61014c" + }, + "source": [ + "!pip install fasttext==0.9.2" ], - "text/plain": [ - " class name \\\n", - "0 1 E. D. Abbott Ltd \n", - "1 1 Schwan-Stabilo \n", - "2 1 Q-workshop \n", - "3 1 Marvell Software Solutions Israel \n", - "4 1 Bergan Mercy Medical Center \n", - "\n", - " description class_name \n", - "0 Abbott of Farnham E D Abbott Limited was a Br... Company \n", - "1 Schwan-STABILO is a German maker of pens for ... Company \n", - "2 Q-workshop is a Polish company located in Poz... Company \n", - "3 Marvell Software Solutions Israel known as RA... Company \n", - "4 Bergan Mercy Medical Center is a hospital loc... Company " + "execution_count": 15, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Collecting fasttext==0.9.2\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)\n", + "\r\u001b[K |████▊ | 10kB 17.9MB/s eta 0:00:01\r\u001b[K |█████████▌ | 20kB 1.7MB/s eta 0:00:01\r\u001b[K |██████████████▎ | 30kB 2.2MB/s eta 0:00:01\r\u001b[K |███████████████████ | 40kB 2.5MB/s eta 0:00:01\r\u001b[K |███████████████████████▉ | 51kB 2.0MB/s eta 0:00:01\r\u001b[K |████████████████████████████▋ | 61kB 2.2MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 71kB 1.9MB/s \n", + "\u001b[?25hRequirement already satisfied: pybind11>=2.2 in /usr/local/lib/python3.6/dist-packages (from fasttext==0.9.2) (2.5.0)\n", + "Requirement already satisfied: setuptools>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from fasttext==0.9.2) (49.2.0)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from fasttext==0.9.2) (1.18.5)\n", 
+ "Building wheels for collected packages: fasttext\n", + " Building wheel for fasttext (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3020301 sha256=b65b73f96edcad8906e88913fc050bb28348ae2d57360e7df66ab675a73822e0\n", + " Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c154b75231136cc3a3321ab0e30f592\n", + "Successfully built fasttext\n", + "Installing collected packages: fasttext\n", + "Successfully installed fasttext-0.9.2\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 64, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "# Since we have no clue about the classes lets build one\n", - "# Mapping from class number to class name\n", - "class_dict={\n", - " 1:'Company',\n", - " 2:'EducationalInstitution',\n", - " 3:'Artist',\n", - " 4:'Athlete',\n", - " 5:'OfficeHolder',\n", - " 6:'MeanOfTransportation',\n", - " 7:'Building',\n", - " 8:'NaturalPlace',\n", - " 9:'Village',\n", - " 10:'Animal',\n", - " 11:'Plant',\n", - " 12:'Album',\n", - " 13:'Film',\n", - " 14:'WrittenWork'\n", - " }\n", - "\n", - "# Mapping the classes\n", - "df['class_name'] = df['class'].map(class_dict)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "si7VC_Rub62a", - "outputId": "78f5b32b-79e5-4a37-d855-ce16219244be" - }, - "outputs": [ + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "YKgZXvTGb61z", + "colab": {} + }, + "source": [ + "#necessary imports\n", + "import pandas as pd" + ], + "execution_count": 1, + "outputs": [] + }, { - "data": { - "text/plain": [ - "NaturalPlace 40000\n", - "Athlete 40000\n", - "MeanOfTransportation 40000\n", - "Company 40000\n", - "Animal 40000\n", - "Album 40000\n", - "Village 40000\n", - "Plant 40000\n", - "WrittenWork 40000\n", - "Film 40000\n", - "OfficeHolder 40000\n", 
- "Building 40000\n", - "EducationalInstitution 40000\n", - "Artist 40000\n", - "Name: class_name, dtype: int64" + "cell_type": "code", + "metadata": { + "id": "l6CfW7C3L4EB", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 588 + }, + "outputId": "f9708301-c566-4094-ad24-ecee460052db" + }, + "source": [ + "# downloading the data\n", + "!wget -P DATAPATH https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz\n", + "\n", + "# untaring the reuqired file\n", + "!tar -xvf DATAPATH/dbpedia_csv.tar.gz -C DATAPATH\n", + "\n", + "# sneek peek in the folder structure\n", + "!ls -lah DATAPATH\n" + ], + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "text": [ + "--2020-08-10 15:31:17-- https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz\n", + "Resolving github.com (github.com)... 140.82.114.4\n", + "Connecting to github.com (github.com)|140.82.114.4|:443... connected.\n", + "HTTP request sent, awaiting response... 301 Moved Permanently\n", + "Location: https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz [following]\n", + "--2020-08-10 15:31:17-- https://github.com/srhrshr/torchDatasets/raw/master/dbpedia_csv.tar.gz\n", + "Reusing existing connection to github.com:443.\n", + "HTTP request sent, awaiting response... 302 Found\n", + "Location: https://raw.githubusercontent.com/srhrshr/torchDatasets/master/dbpedia_csv.tar.gz [following]\n", + "--2020-08-10 15:31:17-- https://raw.githubusercontent.com/srhrshr/torchDatasets/master/dbpedia_csv.tar.gz\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... failed: Connection timed out.\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.64.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 68431223 (65M) [application/octet-stream]\n", + "Saving to: ‘DATAPATH/dbpedia_csv.tar.gz.1’\n", + "\n", + "dbpedia_csv.tar.gz. 100%[===================>] 65.26M 81.3MB/s in 0.8s \n", + "\n", + "2020-08-10 15:31:50 (81.3 MB/s) - ‘DATAPATH/dbpedia_csv.tar.gz.1’ saved [68431223/68431223]\n", + "\n", + "dbpedia_csv/\n", + "dbpedia_csv/test.csv\n", + "dbpedia_csv/classes.txt\n", + "dbpedia_csv/train.csv\n", + "dbpedia_csv/readme.txt\n", + "total 328M\n", + "drwxr-xr-x 3 root root 4.0K Aug 10 15:31 .\n", + "drwxr-xr-x 1 root root 4.0K Aug 10 12:14 ..\n", + "drwxrwxr-x 2 1000 1000 4.0K Mar 29 2015 dbpedia_csv\n", + "-rw-r--r-- 1 root root 66M Aug 10 12:14 dbpedia_csv.tar.gz\n", + "-rw-r--r-- 1 root root 66M Aug 10 15:31 dbpedia_csv.tar.gz.1\n", + "-rw-r--r-- 1 root root 22M Aug 10 12:15 dbpedia_test.csv\n", + "-rw-r--r-- 1 root root 175M Aug 10 12:15 dbpedia_train.csv\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 65, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"class_name\"].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "Sn-3kIqMb62d" - }, - "outputs": [], - "source": [ - "# Lets do some cleaning of this text\n", - "def clean_it(text,normalize=True):\n", - " # Replacing possible issues with data. We can add or reduce the replacemtent in this chain\n", - " s = str(text).replace(',',' ').replace('\"','').replace('\\'',' \\' ').replace('.',' . ').replace('(',' ( ').\\\n", - " replace(')',' ) ').replace('!',' ! ').replace('?',' ? 
').replace(':',' ').replace(';',' ').lower()\n", - " \n", - " # normalizing / encoding the text\n", - " if normalize:\n", - " s = s.normalize('NFKD').str.encode('ascii','ignore').str.decode('utf-8')\n", - " \n", - " return s\n", - "\n", - "# Now lets define a small function where we can use above cleaning on datasets\n", - "def clean_df(data, cleanit= False, shuffleit=False, encodeit=False, label_prefix='__class__'):\n", - " # Defining the new data\n", - " df = data[['name','description']].copy(deep=True)\n", - " df['class'] = label_prefix + data['class'].astype(str) + ' '\n", - " \n", - " # cleaning it\n", - " if cleanit:\n", - " df['name'] = df['name'].apply(lambda x: clean_it(x,encodeit))\n", - " df['description'] = df['description'].apply(lambda x: clean_it(x,encodeit))\n", - " \n", - " # shuffling it\n", - " if shuffleit:\n", - " df.sample(frac=1).reset_index(drop=True)\n", - " \n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "r_DRvdFcb62m", - "outputId": "ba503528-7d01-438b-f7fb-2879874e0a0c" - }, - "outputs": [ + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 4.44 s, sys: 68.1 ms, total: 4.51 s\n", - "Wall time: 4.51 s\n" - ] - } - ], - "source": [ - "%%time\n", - "# Transform the datasets using the above clean functions\n", - "df_train_cleaned = clean_df(df, True, True)\n", - "df_test_cleaned = clean_df(df_test, True, True)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "imMZ9-Bkb62t" - }, - "outputs": [], - "source": [ - "# Write files to disk as fastText classifier API reads files from disk.\n", - "train_file = data_path + 'dbpedia_train.csv'\n", - "df_train_cleaned.to_csv(train_file, header=None, index=False, columns=['class','name','description'] )\n", - "\n", - "test_file = data_path + 'dbpedia_test.csv'\n", - "df_test_cleaned.to_csv(test_file, 
header=None, index=False, columns=['class','name','description'] )\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "bWZTSzd9b62x" - }, - "source": [ - "Now that we have the train and test files written into disk in a format fastText wants, we are ready to use it for text classification!" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "a-H1wouCb62x", - "outputId": "f85a632e-d2b2-4b60-d0e2-93896645947c" - }, - "outputs": [ + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "lMoRw3oQb62I", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "4a9015af-de57-41f3-d932-fa255e48063f" + }, + "source": [ + "data_path = 'DATAPATH'\n", + "# Loading train data\n", + "train_file = data_path + '/dbpedia_csv/train.csv'\n", + "df = pd.read_csv(train_file, header=None, names=['class','name','description'])\n", + "# Loading test data\n", + "test_file = data_path + '/dbpedia_csv/test.csv'\n", + "df_test = pd.read_csv(test_file, header=None, names=['class','name','description'])\n", + "# Data we have\n", + "print(\"Train:{} Test:{}\".format(df.shape,df_test.shape))\n" + ], + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Train:(560000, 3) Test:(70000, 3)\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "gaz226vXb62W", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 195 + }, + "outputId": "9136af68-a5d9-4041-d13c-52d26c38a59c" + }, + "source": [ + "# Since we have no clue about the classes lets build one\n", + "# Mapping from class number to class name\n", + "class_dict={\n", + " 1:'Company',\n", + " 2:'EducationalInstitution',\n", + " 3:'Artist',\n", + " 4:'Athlete',\n", + " 5:'OfficeHolder',\n", + " 6:'MeanOfTransportation',\n", + " 7:'Building',\n", + " 8:'NaturalPlace',\n", + " 
9:'Village',\n", + " 10:'Animal',\n", + " 11:'Plant',\n", + " 12:'Album',\n", + " 13:'Film',\n", + " 14:'WrittenWork'\n", + " }\n", + "\n", + "# Mapping the classes\n", + "df['class_name'] = df['class'].map(class_dict)\n", + "df.head()" + ], + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
classnamedescriptionclass_name
01E. D. Abbott LtdAbbott of Farnham E D Abbott Limited was a Br...Company
11Schwan-StabiloSchwan-STABILO is a German maker of pens for ...Company
21Q-workshopQ-workshop is a Polish company located in Poz...Company
31Marvell Software Solutions IsraelMarvell Software Solutions Israel known as RA...Company
41Bergan Mercy Medical CenterBergan Mercy Medical Center is a hospital loc...Company
\n", + "
import unicodedata

# Character-level rewrite rules applied by clean_it in a single pass:
# commas, colons and semicolons become a space, double quotes are dropped,
# and the remaining punctuation marks are padded with spaces so that they
# end up as separate whitespace-delimited tokens.
_PUNCTUATION_TABLE = str.maketrans({
    ',': ' ',
    '"': '',
    '\'': ' \' ',
    '.': ' . ',
    '(': ' ( ',
    ')': ' ) ',
    '!': ' ! ',
    '?': ' ? ',
    ':': ' ',
    ';': ' ',
})


def clean_it(text, normalize=True):
    """Lower-case a value and separate or strip its punctuation.

    Parameters
    ----------
    text : any
        Value to clean; it is converted with ``str()`` first, so non-string
        cells (numbers, NaN) are accepted.
    normalize : bool, default True
        When true, decompose the text with Unicode NFKD and drop every
        character that cannot be represented in ASCII (e.g. ``'é'`` -> ``'e'``).

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    s = str(text).translate(_PUNCTUATION_TABLE).lower()

    if normalize:
        # Plain ``str`` objects have no ``.normalize`` method or ``.str``
        # accessor (those belong to pandas Series), so the normalisation has
        # to go through ``unicodedata`` and the regular str/bytes codecs.
        s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('utf-8')

    return s


def clean_df(data, cleanit=False, shuffleit=False, encodeit=False,
             label_prefix='__class__', random_state=None):
    """Build a label-prefixed copy of a dataset, optionally cleaned and shuffled.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns ``'class'``, ``'name'`` and
        ``'description'``. It is not modified.
    cleanit : bool, default False
        Apply ``clean_it`` to the ``'name'`` and ``'description'`` columns.
    shuffleit : bool, default False
        Return the rows in random order with a fresh 0..n-1 index.
    encodeit : bool, default False
        Forwarded to ``clean_it`` as its ``normalize`` flag.
    label_prefix : str, default '__class__'
        Text prepended to every class value to form the label.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so a shuffle can be reproduced;
        ``None`` keeps the shuffle non-deterministic.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``'name'``, ``'description'`` and ``'class'``,
        where ``'class'`` is ``label_prefix + str(class) + ' '``.
    """
    df = data[['name', 'description']].copy(deep=True)
    # The trailing space keeps the label apart from whatever follows it once
    # the row is written out as text.
    df['class'] = label_prefix + data['class'].astype(str) + ' '

    if cleanit:
        for column in ('name', 'description'):
            df[column] = df[column].apply(lambda value: clean_it(value, encodeit))

    if shuffleit:
        # ``sample`` returns a new frame instead of shuffling in place, so the
        # result has to be bound back to ``df`` for the shuffle to take effect.
        df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return df
'/dbpedia_test.csv'\n", + "df_test_cleaned.to_csv(test_file, header=None, index=False, columns=['class','name','description'] )\n" + ], + "execution_count": 8, + "outputs": [] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 56.5 s, sys: 1.51 s, total: 58 s\n", - "Wall time: 12.6 s\n", - "70000 0.9710571428571428 0.9710571428571428\n" - ] + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "bWZTSzd9b62x" + }, + "source": [ + "Now that we have the train and test files written into disk in a format fastText wants, we are ready to use it for text classification!" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "a-H1wouCb62x", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + }, + "outputId": "c4f88ae8-0314-480e-e201-6a905a0b3fdb" + }, + "source": [ + "%%time\n", + "## Using fastText for feature extraction and training\n", + "from fasttext import train_supervised \n", + "\"\"\"fastText expects and training file (csv), a model name as input arguments.\n", + "label_prefix refers to the prefix before label string in the dataset.\n", + "default is __label__. In our dataset, it is __class__. 
\n", + "There are several other parameters which can be seen in: \n", + "https://pypi.org/project/fasttext/\n", + "\"\"\"\n", + "model = train_supervised(input=train_file, label=\"__class__\", lr=1.0, epoch=75, loss='ova', wordNgrams=2, dim=200, thread=2, verbose=100)" + ], + "execution_count": 18, + "outputs": [ + { + "output_type": "stream", + "text": [ + "CPU times: user 1h 30min 19s, sys: 25 s, total: 1h 30min 44s\n", + "Wall time: 46min 33s\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "sAyN3ZDbQFq-", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 106 + }, + "outputId": "7b121c71-6605-4e10-86e9-753a24ae02a1" + }, + "source": [ + "for k in range(1,6):\n", + " results = model.test(test_file,k=k)\n", + " print(f\"Test Samples: {results[0]} Precision@{k} : {results[1]*100:2.4f} Recall@{k} : {results[2]*100:2.4f}\")" + ], + "execution_count": 19, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Test Samples: 70000 Precision@1 : 93.2914 Recall@1 : 93.2914\n", + "Test Samples: 70000 Precision@2 : 48.7243 Recall@2 : 97.4486\n", + "Test Samples: 70000 Precision@3 : 32.6376 Recall@3 : 97.9129\n", + "Test Samples: 70000 Precision@4 : 24.6793 Recall@4 : 98.7171\n", + "Test Samples: 70000 Precision@5 : 19.8411 Recall@5 : 99.2057\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "nrxSYRs3b621" + }, + "source": [ + "Try training a classifier on this dataset with, say, LogisticRegression to realize how fast fastText is! 93% Precision and Recall are hard numbers to beat, too!" 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Bp9w8RScruz7", + "colab_type": "code", + "colab": {} + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] } - ], - "source": [ - "## Using fastText for feature extraction and training\n", - "from fasttext import supervised \n", - "\"\"\"fastText expects and training file (csv), a model name as input arguments.\n", - "label_prefix refers to the prefix before label string in the dataset.\n", - "default is __label__. In our dataset, it is __class__. \n", - "There are several other parameters which can be seen in: \n", - "https://pypi.org/project/fasttext/\n", - "\"\"\"\n", - "%time model = supervised(train_file, 'temp', label_prefix=\"__class__\")\n", - "results = model.test(test_file)\n", - "print(results.nexamples, results.precision, results.recall)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "nrxSYRs3b621" - }, - "source": [ - "Try training a classifier on this dataset with, say, LogisticRegression to realize how fast fastText is! 97% Precision and Recall are hard numbers to beat, too!" - ] - } - ], - "metadata": { - "colab": { - "name": "FastText_Example.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.10" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} + ] +} \ No newline at end of file