Skip to content

Commit 8d9c909

Browse files
committed
fix: disallow mailto: (#861)
1 parent 5be7c49 commit 8d9c909

File tree

1 file changed

+31
-5
lines changed

1 file changed

+31
-5
lines changed

scrapegraphai/nodes/fetch_node_level_k.py

+31-5
Original file line numberDiff line numberDiff line change
@@ -160,20 +160,42 @@ def extract_links(self, html_content: str) -> list:
160160
def get_full_links(self, base_url: str, links: list) -> list:
    """
    Converts relative URLs to full URLs based on the base URL.
    Filters out non-web links (mailto:, tel:, javascript:, etc.).

    Scheme checks are case-insensitive and ignore surrounding whitespace,
    so "HTTP://host/x" and "  mailto:me@host " are classified the same way
    as their canonical spellings.

    When ``self.only_inside_links`` is true, every link that carries its
    own authority (``http://``, ``https://`` or protocol-relative ``//``)
    is dropped, including absolute links that point back at the base
    host; only relative links are resolved and returned.

    Args:
        base_url (str): The base URL for resolving relative links.
        links (list): A list of links to convert. Entries that are not
            strings (e.g. ``None``) are skipped.

    Returns:
        list: A list of valid full URLs (http:// or https:// only), in
            input order.
    """
    web_prefixes = ("http://", "https://")
    # A protocol-relative link ("//host/path") names its own host, so it is
    # just as external as a link with an explicit http(s) scheme.
    absolute_prefixes = web_prefixes + ("//",)
    # Schemes that can never be fetched as a web page. Kept as a tuple so a
    # single str.startswith call tests all of them.
    invalid_schemes = (
        "mailto:", "tel:", "fax:", "sms:", "callto:", "wtai:", "javascript:",
        "data:", "file:", "ftp:", "irc:", "news:", "nntp:", "feed:", "webcal:",
        "skype:", "im:", "mtps:", "spotify:", "steam:", "teamspeak:", "udp:",
        "unreal:", "ut2004:", "ventrilo:", "view-source:", "ws:", "wss:",
    )

    full_links = []
    for link in links:
        # Tolerate missing hrefs instead of crashing on link.lower()
        if not isinstance(link, str):
            continue

        # Surrounding whitespace is not part of the URL and would otherwise
        # defeat the prefix checks below.
        link = link.strip()
        lowered = link.lower()

        # Skip if link starts with any invalid scheme
        if lowered.startswith(invalid_schemes):
            continue

        # Skip absolute links when only relative (inside) links are wanted
        if self.only_inside_links and lowered.startswith(absolute_prefixes):
            continue

        if lowered.startswith(web_prefixes):
            full_link = link
        else:
            # Convert relative URLs to absolute URLs; urljoin raises
            # ValueError on malformed authorities such as "//[bad".
            try:
                full_link = urljoin(base_url, link)
            except (TypeError, ValueError) as e:
                self.logger.warning(f"Failed to process link {link}: {e}")
                continue

        # Ensure the final URL is http(s); this also rejects any scheme not
        # listed in invalid_schemes, which urljoin returns unchanged.
        if full_link.lower().startswith(web_prefixes):
            full_links.append(full_link)

    return full_links
178200

179201
def obtain_content(self, documents: List, loader_kwargs) -> List:
@@ -191,7 +213,11 @@ def obtain_content(self, documents: List, loader_kwargs) -> List:
191213
for doc in documents:
192214
source = doc["source"]
193215
if "document" not in doc:
194-
document = self.fetch_content(source, loader_kwargs)
216+
try:
217+
document = self.fetch_content(source, loader_kwargs)
218+
except Exception as e:
219+
self.logger.warning(f"Failed to fetch content for {source}: {str(e)}")
220+
continue
195221

196222
if not document or not document[0].page_content.strip():
197223
self.logger.warning(f"Failed to fetch content for {source}")

0 commit comments

Comments
 (0)