File tree 5 files changed +79
-3
lines changed
5 files changed +79
-3
lines changed Original file line number Diff line number Diff line change
1
+ ## [ 1.26.0-beta.8] ( https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.0-beta.7...v1.26.0-beta.8 ) (2024-10-08)
2
+
3
+
4
+ ### Features
5
+
6
+ * undetected_chromedriver support ([ 80ece21] ( https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/80ece2179ac47a7ea42fbae4b61504a49ca18daa ) )
7
+
1
8
## [ 1.26.0-beta.7] ( https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.0-beta.6...v1.26.0-beta.7 ) (2024-10-07)
2
9
3
10
Original file line number Diff line number Diff line change
1
+ """
2
+ Basic example of scraping pipeline using SmartScraper
3
+ """
4
+
5
+ import os
6
+ from dotenv import load_dotenv
7
+ from scrapegraphai .graphs import SmartScraperGraph
8
+ from scrapegraphai .utils import prettify_exec_info
9
+
10
# Populate the process environment from a local .env file before any key is read.
load_dotenv()

# ************************************************
# Graph configuration
# ************************************************

# Settings for the LLM; the API key is read from the GROQ_APIKEY environment
# variable (os.getenv yields None when it is not set).
llm_settings = {
    "model": "groq/gemma-7b-it",
    "api_key": os.getenv("GROQ_APIKEY"),
    "temperature": 0,
}

# "backend" names the browser backend to fetch pages with and "headless"
# is the headless flag handed to it.
graph_config = {
    "llm": llm_settings,
    "headless": False,
    "backend": "undetected_chromedriver",
}

# ************************************************
# Build the SmartScraperGraph and execute it
# ************************************************

scraper = SmartScraperGraph(
    prompt="List me all the projects with their description.",
    # also accepts a string with the already downloaded HTML code
    source="https://perinim.github.io/projects/",
    config=graph_config,
)

print(scraper.run())

# ************************************************
# Report graph execution info
# ************************************************

print(prettify_exec_info(scraper.get_execution_info()))
Original file line number Diff line number Diff line change 20
20
"api_key" : groq_key ,
21
21
"temperature" : 0
22
22
},
23
- "headless" : False
23
+ "headless" : False ,
24
24
}
25
25
26
26
# ************************************************
Original file line number Diff line number Diff line change 1
1
[project ]
2
2
name = " scrapegraphai"
3
3
4
- version = " 1.26.0b7 "
4
+ version = " 1.26.0b8 "
5
5
6
6
description = " A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
7
7
authors = [
Original file line number Diff line number Diff line change @@ -57,6 +57,28 @@ def __init__(
57
57
self .urls = urls
58
58
self .load_state = load_state
59
59
60
async def ascrape_undetected_chromedriver(self, url: str) -> str:
    """
    Asynchronously scrape the content of a given URL using undetected chrome with Selenium.

    A Chrome driver is started for this call, pointed at ``url``, and shut
    down again before returning, whether or not the scrape succeeded.

    Args:
        url (str): The URL to scrape.

    Returns:
        str: The scraped HTML content or an error message if an exception occurs.

    """
    # Imported lazily so the dependency is only needed when this backend is used.
    import undetected_chromedriver as uc

    logger.info(f"Starting scraping with {self.backend}...")
    results = ""
    driver = None
    try:
        driver = uc.Chrome(headless=self.headless)
        # Selenium's ``get`` only navigates and returns None; the page HTML
        # is exposed afterwards through the ``page_source`` property.
        driver.get(url)
        results = driver.page_source
    except Exception as e:
        # Callers get the failure as text rather than an exception.
        results = f"Error: {e}"
    finally:
        # Always release the browser process, even when navigation failed.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                # A failed shutdown must not mask the scrape result.
                logger.warning(f"Failed to close undetected chromedriver: {e}")
    return results
81
+
60
82
async def ascrape_playwright (self , url : str ) -> str :
61
83
"""
62
84
Asynchronously scrape the content of a given URL using Playwright's async API.
@@ -71,7 +93,7 @@ async def ascrape_playwright(self, url: str) -> str:
71
93
from playwright .async_api import async_playwright
72
94
from undetected_playwright import Malenia
73
95
74
- logger .info ("Starting scraping..." )
96
+ logger .info (f "Starting scraping with { self . backend } ..." )
75
97
results = ""
76
98
async with async_playwright () as p :
77
99
browser = await p .chromium .launch (
You can’t perform that action at this time.
0 commit comments