Merge pull request #23 from practical-data-science/21-get-indexed-pages-fix

practical-data-science · web-flow · commit 0e8154968d45 · 2022-01-22T15:45:44.000Z
21 get indexed pages fix
diff --git a/ecommercetools/seo/google_search.py b/ecommercetools/seo/google_search.py
@@ -2,6 +2,7 @@
 General functions for scraping data from Google search engine results pages.
 """
 
+import re
 import requests
 import urllib.parse
 import pandas as pd
@@ -22,8 +23,15 @@ def _get_source(url: str):
     try:
         session = HTMLSession()
         response = session.get(url)
-        return response
 
+        if response.status_code == 200:
+            return response
+        elif response.status_code == 429:
+            print('Error: Too many requests. Google has temporarily blocked you. Try again later.')
+            exit()
+        else:
+            print('Error:' + response)
+            exit()
     except requests.exceptions.RequestException as e:
         print(e)
 
@@ -38,10 +46,13 @@ def _get_site_results(url: str):
         response (str): HTML of page.
     """
 
-    query = urllib.parse.quote_plus(url)
-    response = _get_source("https://www.google.co.uk/search?q=site%3A" + query)
+    try:
+        query = urllib.parse.quote_plus(url)
+        response = _get_source("https://www.google.co.uk/search?q=site%3A" + query)
 
-    return response
+        return response
+    except requests.exceptions.RequestException as e:
+        print(e)
 
 
 def _parse_site_results(response: str):
@@ -54,9 +65,22 @@ def _parse_site_results(response: str):
         indexed: Number of pages "indexed".
     """
 
-    string = response.html.find("#result-stats", first=True).text
-    indexed = int(string.split(' ')[1].replace(',', ''))
-    return indexed
+    try:
+        if response.html.find("#result-stats", first=True):
+
+            string = response.html.find("#result-stats", first=True).text
+            if string:
+                # Remove values in parentheses, e.g. (0.31 seconds)
+                string = re.sub(r'\([^)]*\)', '', string)
+
+                # Remove non-numeric characters
+                string = re.sub('[^0-9]', '', string)
+
+                return string
+            else:
+                return 0
+    except requests.exceptions.RequestException as e:
+        print(e)
 
 
 def _count_indexed_pages(url: str):
@@ -119,31 +143,73 @@ def _get_next_page(response, domain="google.co.uk"):
 
 
 def _parse_search_results(response):
-    css_identifier_result = ".tF2Cxc"
-    css_identifier_title = "h3"
-    css_identifier_link = ".yuRUbf a"
-    css_identifier_text = ".IsZvec"
+    """Parses the Google Search engine results and returns a list of results.
+
+    Note: This function is obviously dependent upon the source code in the Google results.
+    Google obfuscates the source of the page to make it more difficult to extract information.
+    Extraction classes change from time to time, so there is always a likelihood that this
+    function will need to be adjusted with the new class or identifier details.
+    In the event of the function failing, please raise a GitHub issue.
 
-    results = response.html.find(css_identifier_result)
+    Args:
+        response: Response object containing the page source code.
+
+    Returns:
+        list: List of Google search results.
+    """
+
+    css_identifier_result = ".tF2Cxc"  # The class of the div containing each result, i.e. <div class="tF2Cxc">
+    css_identifier_title = "h3"  # The element containing the title, i.e. <h3 class="...
+    css_identifier_link = ".yuRUbf a"  # The class of the div containing the anchor, i.e. <div class="yuRUbf"><a ...
+    css_identifier_text = ".VwiC3b"  # The class of the parent element containing the snippet <span>
+    css_identifier_bold = ".VwiC3b span em"  # The class of the element containing the snippet <span><em>
+
+    try:
+        results = response.html.find(css_identifier_result)
 
-    output = []
+        output = []
 
-    for result in results:
-        item = {
-            'title': result.find(css_identifier_title, first=True).text,
-            'link': result.find(css_identifier_link, first=True).attrs['href'],
-            'text': result.find(css_identifier_text, first=True).text
-        }
+        for result in results:
 
-        output.append(item)
+            if result.find(css_identifier_text, first=True):
+                text = result.find(css_identifier_text, first=True).text
+            else:
+                text = ''
 
-    return output
+            if result.find(css_identifier_title, first=True):
+                title = result.find(css_identifier_title, first=True).text
+            else:
+                title = ''
+
+            if result.find(css_identifier_link, first=True):
+                link = result.find(css_identifier_link, first=True).attrs['href']
+            else:
+                link = ''
+
+            # Extract bold text
+            if result.find(css_identifier_bold, first=True):
+                bold = result.find(css_identifier_bold, first=True).text.lower()
+            else:
+                bold = ''
+
+            item = {
+                'title': title,
+                'link': link,
+                'text': text,
+                'bold': bold,
+            }
+
+            output.append(item)
+
+        return output
+    except requests.exceptions.RequestException as e:
+        print(e)
 
 
 def get_serps(query: str,
-                        output="dataframe",
-                        pages=1,
-                        domain="google.co.uk"):
+              output="dataframe",
+              pages=1,
+              domain="google.co.uk"):
     """Return the Google search results for a given query.
 
     Args:
@@ -175,4 +241,4 @@ def get_serps(query: str,
             df.index.names = ['position']
             return df.reset_index()
         else:
-            return results
+            return results
diff --git a/example.py b/example.py
@@ -7,6 +7,9 @@
 from ecommercetools import seo
 from ecommercetools import reports
 
+"""
+
+
 # =======================================================================
 # Load sample data
 # =======================================================================
@@ -121,12 +124,25 @@
 
 print(df_pages)
 
+
 # =======================================================================
 # Get SERPs
 # =======================================================================
 
-results = seo.get_serps("data science courses", pages=5)
+results = seo.get_serps("bearded dragon brumation", pages=3)
 print(results)
+"""
+
+# =======================================================================
+# Get indexed pages
+# =======================================================================
+
+results = seo.get_indexed_pages(["https://www.bbc.co.uk",  # Millions
+                                 "https://www.practicaldatascience.co.uk",  # 1
+                                 "https://www.shj989uiskjdlksjd.com"  # None
+                                 ])
+print(results)
+exit()
 
 # =======================================================================
 # Get all Google Search Console data
@@ -152,7 +168,7 @@
 payload_before = {
     'startDate': "2021-08-11",
     'endDate': "2021-08-31",
-    'dimensions': ["page","query"],
+    'dimensions': ["page", "query"],
 }
 
 payload_after = {
diff --git a/setup.py b/setup.py
@@ -9,17 +9,17 @@
 setup(
     name='ecommercetools',
     packages=find_namespace_packages(include=['ecommercetools.*']),
-    version='0.42',
+    version='0.42.2',
     license='MIT',
-    description='EcommerceTools is a data science toolkit for ecommerce, marketing science, and technical SEO.',
+    description='EcommerceTools is a data science toolkit for ecommerce, marketing science, and Python SEO.',
     long_description=long_description,
     long_description_content_type='text/markdown',
     author='Matt Clarke',
     author_email='matt@practicaldatascience.co.uk',
     url='https://github.com/practical-data-science/ecommercetools',
     download_url='https://github.com/practical-data-science/ecommercetools/archive/master.zip',
     keywords=['ecommerce', 'marketing', 'seo', 'seo testing', 'customers', 'products', 'rfm', 'abc',
-              'operations', 'analytics', 'python', 'pandas', 'nlp', 'causal impact'],
+              'operations', 'analytics', 'python', 'python seo', 'pandas', 'nlp', 'causal impact'],
     classifiers=[
         'Development Status :: 3 - Alpha',
         'Intended Audience :: Developers',