8
8
9
9
import re
10
10
from html .parser import HTMLParser
11
+ from typing import Optional
11
12
from urllib import parse
12
13
13
14
import requests
14
15
15
16
16
17
class Parser (HTMLParser ):
17
- def __init__ (self , domain : str ):
18
- HTMLParser .__init__ (self )
19
- self .data = []
18
+ def __init__ (self , domain : str ) -> None :
19
+ super () .__init__ ()
20
+ self .urls : list [ str ] = []
20
21
self .domain = domain
21
22
22
- def handle_starttag (self , tag : str , attrs : str ) -> None :
23
+ def handle_starttag (self , tag : str , attrs : list [ tuple [ str , Optional [ str ]]] ) -> None :
23
24
"""
24
25
This function parses HTML and collects the URLs found in anchor tags
25
26
"""
@@ -29,10 +30,10 @@ def handle_starttag(self, tag: str, attrs: str) -> None:
29
30
for name , value in attrs :
30
31
# Keep the href only if it is defined and is neither empty nor "#".
31
32
if name == "href" and value != "#" and value != "" :
32
- # If not already in data .
33
- if value not in self .data :
33
+ # If not already in urls .
34
+ if value not in self .urls :
34
35
url = parse .urljoin (self .domain , value )
35
- self .data .append (url )
36
+ self .urls .append (url )
36
37
37
38
38
39
# Get main domain name (example.com)
@@ -59,7 +60,7 @@ def get_sub_domain_name(url: str) -> str:
59
60
return parse .urlparse (url ).netloc
60
61
61
62
62
- def emails_from_url (url : str = "https://github.com" ) -> list :
63
+ def emails_from_url (url : str = "https://github.com" ) -> list [ str ] :
63
64
"""
64
65
This function takes a URL and returns the valid e-mail addresses found on that page
65
66
"""
@@ -78,7 +79,7 @@ def emails_from_url(url: str = "https://github.com") -> list:
78
79
79
80
# Get links and loop through
80
81
valid_emails = set ()
81
- for link in parser .data :
82
+ for link in parser .urls :
82
83
# open URL.
83
84
# read = requests.get(link)
84
85
try :
0 commit comments