"""
SearchLinkGraph Module
"""

from typing import Optional, Type

from pydantic import BaseModel

from ..nodes import FetchNode, SearchLinkNode, SearchLinksWithContext
from .abstract_graph import AbstractGraph
from .base_graph import BaseGraph


class SearchLinkGraph(AbstractGraph):
"""
SearchLinkGraph is a scraping pipeline that automates the process of
extracting information from web pages using a natural language model
to interpret and answer prompts.
Attributes:
prompt (str): The prompt for the graph.
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (BaseModel): The schema for the graph output.
llm_model: An instance of a language model client, configured for generating answers.
embedder_model: An instance of an embedding model client,
configured for generating embeddings.
verbose (bool): A flag indicating whether to show print statements during execution.
headless (bool): A flag indicating whether to run the graph in headless mode.
Args:
source (str): The source of the graph.
config (dict): Configuration parameters for the graph.
schema (BaseModel, optional): The schema for the graph output. Defaults to None.
"""

    def __init__(
        self, source: str, config: dict, schema: Optional[Type[BaseModel]] = None
    ):
        # This pipeline takes no user prompt, so an empty string is passed.
        super().__init__("", config, source, schema)
        # Treat the source as a URL when it starts with "http";
        # otherwise treat it as a local directory or file.
        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.

        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc"],
            node_config={
                "force": self.config.get("force", False),
                "cut": self.config.get("cut", True),
                "loader_kwargs": self.config.get("loader_kwargs", {}),
                "storage_state": self.config.get("storage_state"),
            },
        )

        # When "llm_style" is truthy, use the LLM-backed node to extract links
        # with surrounding context; otherwise use the lighter filtering node.
        if self.config.get("llm_style"):
            search_link_node = SearchLinksWithContext(
                input="doc",
                output=["parsed_doc"],
                node_config={
                    "llm_model": self.llm_model,
                    "chunk_size": self.model_token,
                },
            )
        else:
            search_link_node = SearchLinkNode(
                input="doc",
                output=["parsed_doc"],
                node_config={
                    "chunk_size": self.model_token,
                    "filter_links": True,
                    "filter_config": self.config.get("filter_config", {}),
                },
            )
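
        # The pipeline is linear: FetchNode loads the source into "doc", then
        # the selected search-link node extracts links into "parsed_doc".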
        return BaseGraph(
            nodes=[fetch_node, search_link_node],
            edges=[(fetch_node, search_link_node)],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__,
        )

    def run(self) -> str:
        """
        Executes the scraping process and returns the extracted links.

        Returns:
            str: The links found in the source, or a fallback message
                if none were found.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("parsed_doc", "No answer found.")
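

# Minimal usage sketch (illustrative only: the "llm" configuration keys and
# model name are assumptions that depend on your provider setup, and the
# source URL is a placeholder):
#
#   graph = SearchLinkGraph(
#       source="https://example.com",
#       config={"llm": {"model": "openai/gpt-4o-mini", "api_key": "sk-..."}},
#   )
#   links = graph.run()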