Skip to content

Commit a447aea

Browse files
committed
Merge branch 'pre/beta' of https://github.com/ScrapeGraphAI/Scrapegraph-ai into pre/beta
2 parents a200f59 + 38b795e commit a447aea

File tree

5 files changed

+79
-3
lines changed

5 files changed

+79
-3
lines changed

CHANGELOG.md

+7
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
## [1.26.0-beta.8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.0-beta.7...v1.26.0-beta.8) (2024-10-08)
2+
3+
4+
### Features
5+
6+
* undetected_chromedriver support ([80ece21](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/80ece2179ac47a7ea42fbae4b61504a49ca18daa))
7+
18
## [1.26.0-beta.7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.0-beta.6...v1.26.0-beta.7) (2024-10-07)
29

310

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""
Basic example of scraping pipeline using SmartScraper
"""

import os

from dotenv import load_dotenv

from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

# Pull GROQ_APIKEY (and any other settings) from a local .env file.
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

groq_key = os.getenv("GROQ_APIKEY")

graph_config = {
    "llm": {
        "model": "groq/gemma-7b-it",
        "api_key": groq_key,
        "temperature": 0
    },
    # Run with a visible browser window, driven by undetected-chromedriver
    # instead of the default Playwright backend.
    "headless": False,
    "backend": "undetected_chromedriver"
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their description.",
    # also accepts a string with the already downloaded HTML code
    source="https://perinim.github.io/projects/",
    config=graph_config
)

result = smart_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

examples/groq/smart_scraper_groq.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
"api_key": groq_key,
2121
"temperature": 0
2222
},
23-
"headless": False
23+
"headless": False,
2424
}
2525

2626
# ************************************************

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[project]
22
name = "scrapegraphai"
33

4-
version = "1.26.0b7"
4+
version = "1.26.0b8"
55

66
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
77
authors = [

scrapegraphai/docloaders/chromium.py

+23-1
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,28 @@ def __init__(
5757
self.urls = urls
5858
self.load_state = load_state
5959

60+
async def ascrape_undetected_chromedriver(self, url: str) -> str:
    """
    Asynchronously scrape the content of a given URL using
    undetected-chromedriver (Selenium).

    Args:
        url (str): The URL to scrape.

    Returns:
        str: The scraped HTML content, or an "Error: ..." message if an
        exception occurs (the error-string contract matches the other
        scraping backends).
    """
    # Imported lazily so the optional dependency is only required when
    # this backend is actually selected.
    import undetected_chromedriver as uc

    logger.info(f"Starting scraping with {self.backend}...")
    results = ""
    driver = None
    try:
        driver = uc.Chrome(headless=self.headless)
        # driver.get() returns None; the rendered HTML is exposed via
        # the page_source attribute after navigation completes.
        driver.get(url)
        results = driver.page_source
    except Exception as e:
        results = f"Error: {e}"
    finally:
        # Always shut the browser down, even on failure, so each call
        # does not leak a Chrome process.
        if driver is not None:
            driver.quit()
    return results
81+
6082
async def ascrape_playwright(self, url: str) -> str:
6183
"""
6284
Asynchronously scrape the content of a given URL using Playwright's async API.
@@ -71,7 +93,7 @@ async def ascrape_playwright(self, url: str) -> str:
7193
from playwright.async_api import async_playwright
7294
from undetected_playwright import Malenia
7395

74-
logger.info("Starting scraping...")
96+
logger.info(f"Starting scraping with {self.backend}...")
7597
results = ""
7698
async with async_playwright() as p:
7799
browser = await p.chromium.launch(

0 commit comments

Comments
 (0)