@@ -160,20 +160,42 @@ def extract_links(self, html_content: str) -> list:
160
160
def get_full_links(self, base_url: str, links: list) -> list:
    """
    Converts relative URLs to full URLs based on the base URL.
    Filters out non-web links (mailto:, tel:, javascript:, etc.).

    Scheme detection is case-insensitive and ignores surrounding
    whitespace, so ``MAILTO:x`` and ``" HTTPS://host/"`` are classified
    the same way as their lowercase, trimmed forms.

    Args:
        base_url (str): The base URL for resolving relative links.
        links (list): A list of links to convert. Entries that are not
            strings are skipped.

    Returns:
        list: A list of valid full URLs (only ``http://`` / ``https://``
        results are kept). When ``self.only_inside_links`` is true,
        links that are already absolute http(s) URLs are skipped.
    """
    # Prefixes of URL schemes that can never be fetched as web pages.
    # Kept as a tuple so a single str.startswith() call can test them all.
    invalid_schemes = (
        'mailto:', 'tel:', 'fax:', 'sms:', 'callto:', 'wtai:', 'javascript:',
        'data:', 'file:', 'ftp:', 'irc:', 'news:', 'nntp:', 'feed:', 'webcal:',
        'skype:', 'im:', 'mtps:', 'spotify:', 'steam:', 'teamspeak:', 'udp:',
        'unreal:', 'ut2004:', 'ventrilo:', 'view-source:', 'ws:', 'wss:',
    )
    web_schemes = ('http://', 'https://')

    full_links = []
    for link in links:
        # Guard against None / non-string entries instead of crashing.
        if not isinstance(link, str):
            continue

        # Whitespace around an href is not significant; trimming it also
        # prevents " mailto:..." from slipping past the prefix checks.
        link = link.strip()
        lowered = link.lower()

        # Skip if link starts with any invalid scheme.
        if lowered.startswith(invalid_schemes):
            continue

        # URL schemes are case-insensitive, so classify on the lowered form.
        is_absolute = lowered.startswith(web_schemes)

        # Skip absolute (treated as external) links when only_inside_links is True.
        if self.only_inside_links and is_absolute:
            continue

        # Convert relative URLs to absolute URLs; only urljoin can raise here.
        try:
            full_link = link if is_absolute else urljoin(base_url, link)
        except (ValueError, TypeError) as exc:
            self.logger.warning("Failed to process link %s: %s", link, exc)
            continue

        # Ensure the final URL is an http(s) URL (the base may use another scheme).
        if full_link.lower().startswith(web_schemes):
            full_links.append(full_link)

    return full_links
178
200
179
201
def obtain_content (self , documents : List , loader_kwargs ) -> List :
@@ -191,7 +213,11 @@ def obtain_content(self, documents: List, loader_kwargs) -> List:
191
213
for doc in documents :
192
214
source = doc ["source" ]
193
215
if "document" not in doc :
194
- document = self .fetch_content (source , loader_kwargs )
216
+ try :
217
+ document = self .fetch_content (source , loader_kwargs )
218
+ except Exception as e :
219
+ self .logger .warning (f"Failed to fetch content for { source } : { str (e )} " )
220
+ continue
195
221
196
222
if not document or not document [0 ].page_content .strip ():
197
223
self .logger .warning (f"Failed to fetch content for { source } " )
0 commit comments