Skip to content

Commit a447aea

Browse files
committed
Merge branch 'pre/beta' of https://github.com/ScrapeGraphAI/Scrapegraph-ai into pre/beta
2 parents a200f59 + 38b795e commit a447aea

File tree

5 files changed

+79
-3
lines changed

5 files changed

+79
-3
lines changed

CHANGELOG.md

+7
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
## [1.26.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.0-beta.7...v1.26.0-beta.8) (2024-10-08)
2+
3+
4+
### Features
5+
6+
* undetected_chromedriver support ([80ece21](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/80ece2179ac47a7ea42fbae4b61504a49ca18daa))
7+
18
## [1.26.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.0-beta.6...v1.26.0-beta.7) (2024-10-07)
29

310

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""
Basic example of scraping pipeline using SmartScraper
"""

import os

from dotenv import load_dotenv

from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

# Pull GROQ_APIKEY (and any other settings) from a local .env file.
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

groq_key = os.getenv("GROQ_APIKEY")

graph_config = {
    "llm": {
        "model": "groq/gemma-7b-it",
        "api_key": groq_key,
        "temperature": 0
    },
    # Run with a visible browser window, driven by undetected-chromedriver
    # instead of the default Playwright backend.
    "headless": False,
    "backend": "undetected_chromedriver"
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their description.",
    # also accepts a string with the already downloaded HTML code
    source="https://perinim.github.io/projects/",
    config=graph_config
)

result = smart_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

examples/groq/smart_scraper_groq.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
"api_key": groq_key,
2121
"temperature": 0
2222
},
23-
"headless": False
23+
"headless": False,
2424
}
2525

2626
# ************************************************

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[project]
22
name = "scrapegraphai"
33

4-
version = "1.26.0b7"
4+
version = "1.26.0b8"
55

66
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
77
authors = [

scrapegraphai/docloaders/chromium.py

+23-1
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,28 @@ def __init__(
5757
self.urls = urls
5858
self.load_state = load_state
5959

60+
async def ascrape_undetected_chromedriver(self, url: str) -> str:
    """
    Asynchronously scrape the content of a given URL using
    undetected-chromedriver (Selenium).

    Args:
        url (str): The URL to scrape.

    Returns:
        str: The scraped HTML content, or an "Error: ..." message if an
        exception occurs (the error-string contract matches the other
        scraping backends).
    """
    # Imported lazily so the optional dependency is only required when
    # this backend is actually selected.
    import undetected_chromedriver as uc

    logger.info(f"Starting scraping with {self.backend}...")
    results = ""
    driver = None
    try:
        driver = uc.Chrome(headless=self.headless)
        # driver.get() returns None; the rendered HTML is exposed via
        # the page_source attribute after navigation completes.
        driver.get(url)
        results = driver.page_source
    except Exception as e:
        results = f"Error: {e}"
    finally:
        # Always shut the browser down, even on failure, so each call
        # does not leak a Chrome process.
        if driver is not None:
            driver.quit()
    return results
81+
6082
async def ascrape_playwright(self, url: str) -> str:
6183
"""
6284
Asynchronously scrape the content of a given URL using Playwright's async API.
@@ -71,7 +93,7 @@ async def ascrape_playwright(self, url: str) -> str:
7193
from playwright.async_api import async_playwright
7294
from undetected_playwright import Malenia
7395

74-
logger.info("Starting scraping...")
96+
logger.info(f"Starting scraping with {self.backend}...")
7597
results = ""
7698
async with async_playwright() as p:
7799
browser = await p.chromium.launch(

0 commit comments

Comments
 (0)