2
2
General functions for scraping data from Google search engine results pages.
3
3
"""
4
4
5
+ import re
5
6
import requests
6
7
import urllib .parse
7
8
import pandas as pd
@@ -22,8 +23,15 @@ def _get_source(url: str):
22
23
try :
23
24
session = HTMLSession ()
24
25
response = session .get (url )
25
- return response
26
26
27
+ if response .status_code == 200 :
28
+ return response
29
+ elif response .status_code == 429 :
30
+ print ('Error: Too many requests. Google has temporarily blocked you. Try again later.' )
31
+ exit ()
32
+ else :
33
+ print ('Error:' + response )
34
+ exit ()
27
35
except requests .exceptions .RequestException as e :
28
36
print (e )
29
37
@@ -38,10 +46,13 @@ def _get_site_results(url: str):
38
46
response (str): HTML of page.
39
47
"""
40
48
41
- query = urllib .parse .quote_plus (url )
42
- response = _get_source ("https://www.google.co.uk/search?q=site%3A" + query )
49
+ try :
50
+ query = urllib .parse .quote_plus (url )
51
+ response = _get_source ("https://www.google.co.uk/search?q=site%3A" + query )
43
52
44
- return response
53
+ return response
54
+ except requests .exceptions .RequestException as e :
55
+ print (e )
45
56
46
57
47
58
def _parse_site_results (response : str ):
@@ -54,9 +65,22 @@ def _parse_site_results(response: str):
54
65
indexed: Number of pages "indexed".
55
66
"""
56
67
57
- string = response .html .find ("#result-stats" , first = True ).text
58
- indexed = int (string .split (' ' )[1 ].replace (',' , '' ))
59
- return indexed
68
+ try :
69
+ if response .html .find ("#result-stats" , first = True ):
70
+
71
+ string = response .html .find ("#result-stats" , first = True ).text
72
+ if string :
73
+ # Remove values in paretheses, i.e. (0.31 seconds)
74
+ string = re .sub (r'\([^)]*\)' , '' , string )
75
+
76
+ # Remove non-numeric characters
77
+ string = re .sub ('[^0-9]' , '' , string )
78
+
79
+ return string
80
+ else :
81
+ return 0
82
+ except requests .exceptions .RequestException as e :
83
+ print (e )
60
84
61
85
62
86
def _count_indexed_pages (url : str ):
@@ -119,31 +143,73 @@ def _get_next_page(response, domain="google.co.uk"):
119
143
120
144
121
145
def _parse_search_results (response ):
122
- css_identifier_result = ".tF2Cxc"
123
- css_identifier_title = "h3"
124
- css_identifier_link = ".yuRUbf a"
125
- css_identifier_text = ".IsZvec"
146
+ """Parses the Google Search engine results and returns a list of results.
147
+
148
+ Note: This function is obviously dependent upon the source code in the Google results.
149
+ Google obfuscates the source of the page to make it more difficult to extra information.
150
+ Extraction classes change from time to time, so there is always a likelihood that this
151
+ function will need to be adjusted with the new class or identifier details.
152
+ In the event of the function failing, please raise a GitHub issue.
126
153
127
- results = response .html .find (css_identifier_result )
154
+ Args:
155
+ response: Response object containing the page source code.
156
+
157
+ Returns:
158
+ list: List of Google search results.
159
+ """
160
+
161
+ css_identifier_result = ".tF2Cxc" # The class of the div containing each result, i.e. <div class="tF2Cxc">
162
+ css_identifier_title = "h3" # The element containing the title, i.e. <h3 class="...
163
+ css_identifier_link = ".yuRUbf a" # The class of the div containing the anchor, i.e. <div class="yuRUbf"><a ...
164
+ css_identifier_text = ".VwiC3b" # The class of the parent element containing the snippet <span>
165
+ css_identifier_bold = ".VwiC3b span em" # The class of the element containing the snippet <span><em>
166
+
167
+ try :
168
+ results = response .html .find (css_identifier_result )
128
169
129
- output = []
170
+ output = []
130
171
131
- for result in results :
132
- item = {
133
- 'title' : result .find (css_identifier_title , first = True ).text ,
134
- 'link' : result .find (css_identifier_link , first = True ).attrs ['href' ],
135
- 'text' : result .find (css_identifier_text , first = True ).text
136
- }
172
+ for result in results :
137
173
138
- output .append (item )
174
+ if result .find (css_identifier_text , first = True ):
175
+ text = result .find (css_identifier_text , first = True ).text
176
+ else :
177
+ text = ''
139
178
140
- return output
179
+ if result .find (css_identifier_title , first = True ):
180
+ title = result .find (css_identifier_title , first = True ).text
181
+ else :
182
+ title = ''
183
+
184
+ if result .find (css_identifier_link , first = True ):
185
+ link = result .find (css_identifier_link , first = True ).attrs ['href' ]
186
+ else :
187
+ link = ''
188
+
189
+ # Extract bold text
190
+ if result .find (css_identifier_bold , first = True ):
191
+ bold = result .find (css_identifier_bold , first = True ).text .lower ()
192
+ else :
193
+ bold = ''
194
+
195
+ item = {
196
+ 'title' : title ,
197
+ 'link' : link ,
198
+ 'text' : text ,
199
+ 'bold' : bold ,
200
+ }
201
+
202
+ output .append (item )
203
+
204
+ return output
205
+ except requests .exceptions .RequestException as e :
206
+ print (e )
141
207
142
208
143
209
def get_serps (query : str ,
144
- output = "dataframe" ,
145
- pages = 1 ,
146
- domain = "google.co.uk" ):
210
+ output = "dataframe" ,
211
+ pages = 1 ,
212
+ domain = "google.co.uk" ):
147
213
"""Return the Google search results for a given query.
148
214
149
215
Args:
@@ -175,4 +241,4 @@ def get_serps(query: str,
175
241
df .index .names = ['position' ]
176
242
return df .reset_index ()
177
243
else :
178
- return results
244
+ return results
0 commit comments