Skip to content

Commit 0e81549

Browse files
Merge pull request #23 from practical-data-science/21-get-indexed-pages-fix
21 get indexed pages fix
2 parents 262d5e5 + fe7d9bd commit 0e81549

File tree

3 files changed

+112
-30
lines changed

3 files changed

+112
-30
lines changed

ecommercetools/seo/google_search.py

+91-25
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
General functions for scraping data from Google search engine results pages.
33
"""
44

5+
import re
56
import requests
67
import urllib.parse
78
import pandas as pd
@@ -22,8 +23,15 @@ def _get_source(url: str):
2223
try:
2324
session = HTMLSession()
2425
response = session.get(url)
25-
return response
2626

27+
if response.status_code == 200:
28+
return response
29+
elif response.status_code == 429:
30+
print('Error: Too many requests. Google has temporarily blocked you. Try again later.')
31+
exit()
32+
else:
33+
print('Error:' + response)
34+
exit()
2735
except requests.exceptions.RequestException as e:
2836
print(e)
2937

@@ -38,10 +46,13 @@ def _get_site_results(url: str):
3846
response (str): HTML of page.
3947
"""
4048

41-
query = urllib.parse.quote_plus(url)
42-
response = _get_source("https://www.google.co.uk/search?q=site%3A" + query)
49+
try:
50+
query = urllib.parse.quote_plus(url)
51+
response = _get_source("https://www.google.co.uk/search?q=site%3A" + query)
4352

44-
return response
53+
return response
54+
except requests.exceptions.RequestException as e:
55+
print(e)
4556

4657

4758
def _parse_site_results(response: str):
@@ -54,9 +65,22 @@ def _parse_site_results(response: str):
5465
indexed: Number of pages "indexed".
5566
"""
5667

57-
string = response.html.find("#result-stats", first=True).text
58-
indexed = int(string.split(' ')[1].replace(',', ''))
59-
return indexed
68+
try:
69+
if response.html.find("#result-stats", first=True):
70+
71+
string = response.html.find("#result-stats", first=True).text
72+
if string:
73+
# Remove values in parentheses, i.e. (0.31 seconds)
74+
string = re.sub(r'\([^)]*\)', '', string)
75+
76+
# Remove non-numeric characters
77+
string = re.sub('[^0-9]', '', string)
78+
79+
return string
80+
else:
81+
return 0
82+
except requests.exceptions.RequestException as e:
83+
print(e)
6084

6185

6286
def _count_indexed_pages(url: str):
@@ -119,31 +143,73 @@ def _get_next_page(response, domain="google.co.uk"):
119143

120144

121145
def _parse_search_results(response):
122-
css_identifier_result = ".tF2Cxc"
123-
css_identifier_title = "h3"
124-
css_identifier_link = ".yuRUbf a"
125-
css_identifier_text = ".IsZvec"
146+
"""Parses the Google Search engine results and returns a list of results.
147+
148+
Note: This function is obviously dependent upon the source code in the Google results.
149+
Google obfuscates the source of the page to make it more difficult to extract information.
150+
Extraction classes change from time to time, so there is always a likelihood that this
151+
function will need to be adjusted with the new class or identifier details.
152+
In the event of the function failing, please raise a GitHub issue.
126153
127-
results = response.html.find(css_identifier_result)
154+
Args:
155+
response: Response object containing the page source code.
156+
157+
Returns:
158+
list: List of Google search results.
159+
"""
160+
161+
css_identifier_result = ".tF2Cxc" # The class of the div containing each result, i.e. <div class="tF2Cxc">
162+
css_identifier_title = "h3" # The element containing the title, i.e. <h3 class="...
163+
css_identifier_link = ".yuRUbf a" # The class of the div containing the anchor, i.e. <div class="yuRUbf"><a ...
164+
css_identifier_text = ".VwiC3b" # The class of the parent element containing the snippet <span>
165+
css_identifier_bold = ".VwiC3b span em" # The class of the element containing the snippet <span><em>
166+
167+
try:
168+
results = response.html.find(css_identifier_result)
128169

129-
output = []
170+
output = []
130171

131-
for result in results:
132-
item = {
133-
'title': result.find(css_identifier_title, first=True).text,
134-
'link': result.find(css_identifier_link, first=True).attrs['href'],
135-
'text': result.find(css_identifier_text, first=True).text
136-
}
172+
for result in results:
137173

138-
output.append(item)
174+
if result.find(css_identifier_text, first=True):
175+
text = result.find(css_identifier_text, first=True).text
176+
else:
177+
text = ''
139178

140-
return output
179+
if result.find(css_identifier_title, first=True):
180+
title = result.find(css_identifier_title, first=True).text
181+
else:
182+
title = ''
183+
184+
if result.find(css_identifier_link, first=True):
185+
link = result.find(css_identifier_link, first=True).attrs['href']
186+
else:
187+
link = ''
188+
189+
# Extract bold text
190+
if result.find(css_identifier_bold, first=True):
191+
bold = result.find(css_identifier_bold, first=True).text.lower()
192+
else:
193+
bold = ''
194+
195+
item = {
196+
'title': title,
197+
'link': link,
198+
'text': text,
199+
'bold': bold,
200+
}
201+
202+
output.append(item)
203+
204+
return output
205+
except requests.exceptions.RequestException as e:
206+
print(e)
141207

142208

143209
def get_serps(query: str,
144-
output="dataframe",
145-
pages=1,
146-
domain="google.co.uk"):
210+
output="dataframe",
211+
pages=1,
212+
domain="google.co.uk"):
147213
"""Return the Google search results for a given query.
148214
149215
Args:
@@ -175,4 +241,4 @@ def get_serps(query: str,
175241
df.index.names = ['position']
176242
return df.reset_index()
177243
else:
178-
return results
244+
return results

example.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
from ecommercetools import seo
88
from ecommercetools import reports
99

10+
"""
11+
12+
1013
# =======================================================================
1114
# Load sample data
1215
# =======================================================================
@@ -121,12 +124,25 @@
121124
122125
print(df_pages)
123126
127+
124128
# =======================================================================
125129
# Get SERPs
126130
# =======================================================================
127131
128-
results = seo.get_serps("data science courses", pages=5)
132+
results = seo.get_serps("bearded dragon brumation", pages=3)
129133
print(results)
134+
"""
135+
136+
# =======================================================================
137+
# Get indexed pages
138+
# =======================================================================
139+
140+
results = seo.get_indexed_pages(["https://www.bbc.co.uk", # Millions
141+
"https://www.practicaldatascience.co.uk", # 1
142+
"https://www.shj989uiskjdlksjd.com" # None
143+
])
144+
print(results)
145+
exit()
130146

131147
# =======================================================================
132148
# Get all Google Search Console data
@@ -152,7 +168,7 @@
152168
payload_before = {
153169
'startDate': "2021-08-11",
154170
'endDate': "2021-08-31",
155-
'dimensions': ["page","query"],
171+
'dimensions': ["page", "query"],
156172
}
157173

158174
payload_after = {

setup.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,17 @@
99
setup(
1010
name='ecommercetools',
1111
packages=find_namespace_packages(include=['ecommercetools.*']),
12-
version='0.42',
12+
version='0.42.2',
1313
license='MIT',
14-
description='EcommerceTools is a data science toolkit for ecommerce, marketing science, and technical SEO.',
14+
description='EcommerceTools is a data science toolkit for ecommerce, marketing science, and Python SEO.',
1515
long_description=long_description,
1616
long_description_content_type='text/markdown',
1717
author='Matt Clarke',
1818
author_email='matt@practicaldatascience.co.uk',
1919
url='https://github.com/practical-data-science/ecommercetools',
2020
download_url='https://github.com/practical-data-science/ecommercetools/archive/master.zip',
2121
keywords=['ecommerce', 'marketing', 'seo', 'seo testing', 'customers', 'products', 'rfm', 'abc',
22-
'operations', 'analytics', 'python', 'pandas', 'nlp', 'causal impact'],
22+
'operations', 'analytics', 'python', 'python seo', 'pandas', 'nlp', 'causal impact'],
2323
classifiers=[
2424
'Development Status :: 3 - Alpha',
2525
'Intended Audience :: Developers',

0 commit comments

Comments
 (0)