Skip to content

Commit 47f04b8

Browse files
Merge pull request #73 from jatinpapreja/master
Updated Ch2, Ch3, Ch4 and Ch10 to install dependencies using requirements files.
2 parents 223e993 + 4c1df95 commit 47f04b8

33 files changed

+9057
-7072
lines changed

Ch10/01_BioBERT_Demo.ipynb

+1,975-1,914
Large diffs are not rendered by default.

Ch10/02_LexNLP.ipynb

+383-174
Large diffs are not rendered by default.

Ch10/03_FinBERT.ipynb

+300-309
Large diffs are not rendered by default.

Ch10/ch10-requirements.txt

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
requests==2.23.0
2+
pytorch-transformers==1.2.0
3+
transformers==4.7.0
4+
pandas==1.1.5
5+
pytorch-pretrained-bert==0.6.2
6+
pytorch-nlp==0.5.0
7+
tensorflow==1.14.0
8+
torch==1.9.0
9+
keras==2.5.0
10+
scikit-learn==0.21.3
11+
tqdm==4.41.1
12+
matplotlib==3.2.2
13+
numpy==1.19.5
14+
nltk==3.5
15+
lexnlp==1.8.0
16+
textract==1.6.3
17+
wget==3.2

Ch2/01_WebScraping_using_BeautifulSoup.ipynb

+62-18
Original file line numberDiff line numberDiff line change
@@ -14,31 +14,75 @@
1414
{
1515
"cell_type": "code",
1616
"execution_count": 1,
17+
"metadata": {},
18+
"outputs": [
19+
{
20+
"name": "stdout",
21+
"output_type": "stream",
22+
"text": [
23+
"Requirement already satisfied: numpy==1.19.5 in c:\\users\\kumar apurv\\anaconda3\\envs\\ch2\\lib\\site-packages (1.19.5)\n",
24+
"Requirement already satisfied: beautifulsoup4==4.6.3 in c:\\users\\kumar apurv\\anaconda3\\envs\\ch2\\lib\\site-packages (4.6.3)\n"
25+
]
26+
}
27+
],
28+
"source": [
29+
"# To install only the requirements of this notebook, run this cell\n",
30+
"\n",
31+
"# ===========================\n",
32+
"\n",
33+
"!pip install numpy==1.19.5\n",
34+
"!pip install beautifulsoup4==4.6.3\n",
35+
"\n",
36+
"# ==========================="
37+
]
38+
},
39+
{
40+
"cell_type": "code",
41+
"execution_count": 2,
42+
"metadata": {},
43+
"outputs": [],
44+
"source": [
45+
"# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n",
46+
"\n",
47+
"# ===========================\n",
48+
"\n",
49+
"# try :\n",
50+
"# import google.colab\n",
51+
"# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch2/ch2-requirements.txt | xargs -n 1 -L 1 pip install\n",
52+
"# except ModuleNotFoundError :\n",
53+
"# !pip install -r \"ch2-requirements.txt\"\n",
54+
"\n",
55+
"# ==========================="
56+
]
57+
},
58+
{
59+
"cell_type": "code",
60+
"execution_count": 3,
1761
"metadata": {
1862
"colab": {},
1963
"colab_type": "code",
2064
"id": "P610gMZrd8SE"
2165
},
2266
"outputs": [],
2367
"source": [
24-
"#making the necessary imports\n",
68+
"# making the necessary imports\n",
2569
"from pprint import pprint\n",
2670
"from bs4 import BeautifulSoup\n",
2771
"from urllib.request import urlopen "
2872
]
2973
},
3074
{
3175
"cell_type": "code",
32-
"execution_count": 2,
76+
"execution_count": 4,
3377
"metadata": {
3478
"colab": {},
3579
"colab_type": "code",
3680
"id": "jfwgiGjJeBSG"
3781
},
3882
"outputs": [],
3983
"source": [
40-
"myurl = \"https://stackoverflow.com/questions/415511/how-to-get-the-current-time-in-python\" #specify the url\n",
41-
"html = urlopen(myurl).read() #query the website so that it returns a html page \n",
84+
"myurl = \"https://stackoverflow.com/questions/415511/how-to-get-the-current-time-in-python\" # specify the url\n",
85+
"html = urlopen(myurl).read() # query the website so that it returns a html page \n",
4286
"soupified = BeautifulSoup(html, 'html.parser') # parse the html in the 'html' variable, and store it in Beautiful Soup format"
4387
]
4488
},
@@ -51,16 +95,16 @@
5195
},
5296
{
5397
"cell_type": "code",
54-
"execution_count": 3,
98+
"execution_count": 5,
5599
"metadata": {},
56100
"outputs": [],
57101
"source": [
58-
"#pprint(soupified.prettify()) #for printing the full HTML structure of the webpage"
102+
"#pprint(soupified.prettify()) # for printing the full HTML structure of the webpage"
59103
]
60104
},
61105
{
62106
"cell_type": "code",
63-
"execution_count": 4,
107+
"execution_count": 6,
64108
"metadata": {
65109
"colab": {
66110
"base_uri": "https://localhost:8080/",
@@ -77,7 +121,7 @@
77121
"output_type": "stream",
78122
"text": [
79123
"('<!DOCTYPE html>\\n'\n",
80-
" '<html class=\"html__responsive\" itemscope=\"\" '\n",
124+
" '<html class=\"html__responsive html__fixed-top-bar\" itemscope=\"\" '\n",
81125
" 'itemtype=\"https://schema.org/QAPage\">\\n'\n",
82126
" ' <head>\\n'\n",
83127
" ' <title>\\n'\n",
@@ -117,21 +161,21 @@
117161
" ' <script '\n",
118162
" 'src=\"https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js\">\\n'\n",
119163
" ' </script>\\n'\n",
120-
" ' <script src=\"https://cdn.sstatic.net/Js/stub.en.js?v=b8a86b92f383\">\\n'\n",
164+
" ' <script src=\"https://cdn.sstatic.net/Js/stub.en.js?v=8eabfaaa0deb\">\\n'\n",
121165
" ' </script>\\n'\n",
122-
" ' <link href=\"https://cdn.sstatic.net/Shared/stacks.css?v=f0ad20c3c35c\" '\n",
166+
" ' <link href=\"https://cdn.sstatic.net/Shared/stacks.css?v=b4e52b95973a\" '\n",
123167
" 'rel=\"stylesheet\" type=\"text/css\"/>\\n'\n",
124-
" ' <link href=\"https://cdn.sstatic.ne')\n"
168+
" ' <link href=\"ht')\n"
125169
]
126170
}
127171
],
128172
"source": [
129-
"pprint(soupified.prettify()[:2000])#to get an idea of the html structure of the webpage "
173+
"pprint(soupified.prettify()[:2000]) # to get an idea of the html structure of the webpage "
130174
]
131175
},
132176
{
133177
"cell_type": "code",
134-
"execution_count": 5,
178+
"execution_count": 7,
135179
"metadata": {
136180
"colab": {
137181
"base_uri": "https://localhost:8080/",
@@ -148,18 +192,18 @@
148192
"<title>datetime - How to get the current time in Python - Stack Overflow</title>"
149193
]
150194
},
151-
"execution_count": 5,
195+
"execution_count": 7,
152196
"metadata": {},
153197
"output_type": "execute_result"
154198
}
155199
],
156200
"source": [
157-
"soupified.title #to get the title of the web page "
201+
"soupified.title # to get the title of the web page "
158202
]
159203
},
160204
{
161205
"cell_type": "code",
162-
"execution_count": 6,
206+
"execution_count": 8,
163207
"metadata": {
164208
"colab": {
165209
"base_uri": "https://localhost:8080/",
@@ -201,11 +245,11 @@
201245
}
202246
],
203247
"source": [
204-
"question = soupified.find(\"div\", {\"class\": \"question\"}) #find the nevessary tag and class which it belongs to\n",
248+
"question = soupified.find(\"div\", {\"class\": \"question\"}) # find the necessary tag and class which it belongs to\n",
205249
"questiontext = question.find(\"div\", {\"class\": \"s-prose js-post-body\"})\n",
206250
"print(\"Question: \\n\", questiontext.get_text().strip())\n",
207251
"\n",
208-
"answer = soupified.find(\"div\", {\"class\": \"answer\"}) #find the nevessary tag and class which it belongs to\n",
252+
"answer = soupified.find(\"div\", {\"class\": \"answer\"}) # find the necessary tag and class which it belongs to\n",
209253
"answertext = answer.find(\"div\", {\"class\": \"s-prose js-post-body\"})\n",
210254
"print(\"Best answer: \\n\", answertext.get_text().strip())"
211255
]

Ch2/03_Extracting_text_from_images_tesseract.ipynb

+69-10
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)