|
14 | 14 | {
|
15 | 15 | "cell_type": "code",
|
16 | 16 | "execution_count": 1,
|
| 17 | + "metadata": {}, |
| 18 | + "outputs": [ |
| 19 | + { |
| 20 | + "name": "stdout", |
| 21 | + "output_type": "stream", |
| 22 | + "text": [ |
| 23 | + "Requirement already satisfied: numpy==1.19.5 in c:\\users\\kumar apurv\\anaconda3\\envs\\ch2\\lib\\site-packages (1.19.5)\n", |
| 24 | + "Requirement already satisfied: beautifulsoup4==4.6.3 in c:\\users\\kumar apurv\\anaconda3\\envs\\ch2\\lib\\site-packages (4.6.3)\n" |
| 25 | + ] |
| 26 | + } |
| 27 | + ], |
| 28 | + "source": [ |
| 29 | + "# Run this cell to install only the requirements of this notebook (comment out the lines below if they are already installed)\n", |
| 30 | + "\n", |
| 31 | + "# ===========================\n", |
| 32 | + "\n", |
| 33 | + "!pip install numpy==1.19.5\n", |
| 34 | + "!pip install beautifulsoup4==4.6.3\n", |
| 35 | + "\n", |
| 36 | + "# ===========================" |
| 37 | + ] |
| 38 | + }, |
| 39 | + { |
| 40 | + "cell_type": "code", |
| 41 | + "execution_count": 2, |
| 42 | + "metadata": {}, |
| 43 | + "outputs": [], |
| 44 | + "source": [ |
| 45 | + "# To install the requirements for the entire chapter, uncomment the lines below and run this cell\n", |
| 46 | + "\n", |
| 47 | + "# ===========================\n", |
| 48 | + "\n", |
| 49 | + "# try :\n", |
| 50 | + "# import google.colab\n", |
| 51 | + "# !curl https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch2/ch2-requirements.txt | xargs -n 1 -L 1 pip install\n", |
| 52 | + "# except ModuleNotFoundError :\n", |
| 53 | + "# !pip install -r \"ch2-requirements.txt\"\n", |
| 54 | + "\n", |
| 55 | + "# ===========================" |
| 56 | + ] |
| 57 | + }, |
| 58 | + { |
| 59 | + "cell_type": "code", |
| 60 | + "execution_count": 3, |
17 | 61 | "metadata": {
|
18 | 62 | "colab": {},
|
19 | 63 | "colab_type": "code",
|
20 | 64 | "id": "P610gMZrd8SE"
|
21 | 65 | },
|
22 | 66 | "outputs": [],
|
23 | 67 | "source": [
|
24 |
| - "#making the necessary imports\n", |
| 68 | + "# making the necessary imports\n", |
25 | 69 | "from pprint import pprint\n",
|
26 | 70 | "from bs4 import BeautifulSoup\n",
|
27 | 71 | "from urllib.request import urlopen "
|
28 | 72 | ]
|
29 | 73 | },
|
30 | 74 | {
|
31 | 75 | "cell_type": "code",
|
32 |
| - "execution_count": 2, |
| 76 | + "execution_count": 4, |
33 | 77 | "metadata": {
|
34 | 78 | "colab": {},
|
35 | 79 | "colab_type": "code",
|
36 | 80 | "id": "jfwgiGjJeBSG"
|
37 | 81 | },
|
38 | 82 | "outputs": [],
|
39 | 83 | "source": [
|
40 |
| - "myurl = \"https://stackoverflow.com/questions/415511/how-to-get-the-current-time-in-python\" #specify the url\n", |
41 |
| - "html = urlopen(myurl).read() #query the website so that it returns a html page \n", |
| 84 | + "myurl = \"https://stackoverflow.com/questions/415511/how-to-get-the-current-time-in-python\" # specify the url\n", |
| 85 | + "html = urlopen(myurl).read() # query the website so that it returns an HTML page \n", |
42 | 86 | "soupified = BeautifulSoup(html, 'html.parser') # parse the html in the 'html' variable, and store it in Beautiful Soup format"
|
43 | 87 | ]
|
44 | 88 | },
|
|
51 | 95 | },
|
52 | 96 | {
|
53 | 97 | "cell_type": "code",
|
54 |
| - "execution_count": 3, |
| 98 | + "execution_count": 5, |
55 | 99 | "metadata": {},
|
56 | 100 | "outputs": [],
|
57 | 101 | "source": [
|
58 |
| - "#pprint(soupified.prettify()) #for printing the full HTML structure of the webpage" |
| 102 | + "#pprint(soupified.prettify()) # for printing the full HTML structure of the webpage" |
59 | 103 | ]
|
60 | 104 | },
|
61 | 105 | {
|
62 | 106 | "cell_type": "code",
|
63 |
| - "execution_count": 4, |
| 107 | + "execution_count": 6, |
64 | 108 | "metadata": {
|
65 | 109 | "colab": {
|
66 | 110 | "base_uri": "https://localhost:8080/",
|
|
77 | 121 | "output_type": "stream",
|
78 | 122 | "text": [
|
79 | 123 | "('<!DOCTYPE html>\\n'\n",
|
80 |
| - " '<html class=\"html__responsive\" itemscope=\"\" '\n", |
| 124 | + " '<html class=\"html__responsive html__fixed-top-bar\" itemscope=\"\" '\n", |
81 | 125 | " 'itemtype=\"https://schema.org/QAPage\">\\n'\n",
|
82 | 126 | " ' <head>\\n'\n",
|
83 | 127 | " ' <title>\\n'\n",
|
|
117 | 161 | " ' <script '\n",
|
118 | 162 | " 'src=\"https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js\">\\n'\n",
|
119 | 163 | " ' </script>\\n'\n",
|
120 |
| - " ' <script src=\"https://cdn.sstatic.net/Js/stub.en.js?v=b8a86b92f383\">\\n'\n", |
| 164 | + " ' <script src=\"https://cdn.sstatic.net/Js/stub.en.js?v=8eabfaaa0deb\">\\n'\n", |
121 | 165 | " ' </script>\\n'\n",
|
122 |
| - " ' <link href=\"https://cdn.sstatic.net/Shared/stacks.css?v=f0ad20c3c35c\" '\n", |
| 166 | + " ' <link href=\"https://cdn.sstatic.net/Shared/stacks.css?v=b4e52b95973a\" '\n", |
123 | 167 | " 'rel=\"stylesheet\" type=\"text/css\"/>\\n'\n",
|
124 |
| - " ' <link href=\"https://cdn.sstatic.ne')\n" |
| 168 | + " ' <link href=\"ht')\n" |
125 | 169 | ]
|
126 | 170 | }
|
127 | 171 | ],
|
128 | 172 | "source": [
|
129 |
| - "pprint(soupified.prettify()[:2000])#to get an idea of the html structure of the webpage " |
| 173 | + "pprint(soupified.prettify()[:2000]) # to get an idea of the html structure of the webpage " |
130 | 174 | ]
|
131 | 175 | },
|
132 | 176 | {
|
133 | 177 | "cell_type": "code",
|
134 |
| - "execution_count": 5, |
| 178 | + "execution_count": 7, |
135 | 179 | "metadata": {
|
136 | 180 | "colab": {
|
137 | 181 | "base_uri": "https://localhost:8080/",
|
|
148 | 192 | "<title>datetime - How to get the current time in Python - Stack Overflow</title>"
|
149 | 193 | ]
|
150 | 194 | },
|
151 |
| - "execution_count": 5, |
| 195 | + "execution_count": 7, |
152 | 196 | "metadata": {},
|
153 | 197 | "output_type": "execute_result"
|
154 | 198 | }
|
155 | 199 | ],
|
156 | 200 | "source": [
|
157 |
| - "soupified.title #to get the title of the web page " |
| 201 | + "soupified.title # to get the title of the web page " |
158 | 202 | ]
|
159 | 203 | },
|
160 | 204 | {
|
161 | 205 | "cell_type": "code",
|
162 |
| - "execution_count": 6, |
| 206 | + "execution_count": 8, |
163 | 207 | "metadata": {
|
164 | 208 | "colab": {
|
165 | 209 | "base_uri": "https://localhost:8080/",
|
|
201 | 245 | }
|
202 | 246 | ],
|
203 | 247 | "source": [
|
204 |
| - "question = soupified.find(\"div\", {\"class\": \"question\"}) #find the nevessary tag and class which it belongs to\n", |
| 248 | + "question = soupified.find(\"div\", {\"class\": \"question\"}) # find the necessary tag and the class it belongs to\n", |
205 | 249 | "questiontext = question.find(\"div\", {\"class\": \"s-prose js-post-body\"})\n",
|
206 | 250 | "print(\"Question: \\n\", questiontext.get_text().strip())\n",
|
207 | 251 | "\n",
|
208 |
| - "answer = soupified.find(\"div\", {\"class\": \"answer\"}) #find the nevessary tag and class which it belongs to\n", |
| 252 | + "answer = soupified.find(\"div\", {\"class\": \"answer\"}) # find the necessary tag and the class it belongs to\n", |
209 | 253 | "answertext = answer.find(\"div\", {\"class\": \"s-prose js-post-body\"})\n",
|
210 | 254 | "print(\"Best answer: \\n\", answertext.get_text().strip())"
|
211 | 255 | ]
|
|
0 commit comments