cleanup_html.py
"""
Module for minimizing the code
"""
import json
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup, Comment
from minify_html import minify
def extract_from_script_tags(soup):
    """
    Extracts embedded data from the <script> tags of a parsed page:
    JSON object literals and assignments to window/document attributes.
    """
    script_content = []
for script in soup.find_all("script"):
content = script.string
if content:
            try:
                # Look for object literals assigned to variables, e.g. `var data = {...};`
                json_pattern = r"(?:const|let|var)?\s*\w+\s*=\s*({[\s\S]*?});?$"
json_matches = re.findall(json_pattern, content)
for potential_json in json_matches:
try:
parsed = json.loads(potential_json)
if parsed:
script_content.append(
f"JSON data from script: {json.dumps(parsed, indent=2)}"
)
except json.JSONDecodeError:
pass
if "window." in content or "document." in content:
data_pattern = r"(?:window|document)\.(\w+)\s*=\s*([^;]+);"
data_matches = re.findall(data_pattern, content)
for var_name, var_value in data_matches:
script_content.append(
f"Dynamic data - {var_name}: {var_value.strip()}"
)
            except Exception:
                # If pattern matching fails, fall back to keeping short scripts verbatim
                if len(content) < 1000:
script_content.append(f"Script content: {content.strip()}")
return "\n\n".join(script_content)
def cleanup_html(html_content: str, base_url: str) -> tuple:
    """
    Processes HTML content by removing unnecessary tags,
    minifying the HTML, and extracting the title and body content.

    Args:
        html_content (str): The HTML content to be processed.
        base_url (str): The base URL used to resolve relative links and image sources.

    Returns:
        tuple: The title, the minified body content, the list of link URLs,
        the list of image URLs, and the content extracted from script tags.

    Raises:
        ValueError: If no <body> tag is found in the HTML content.

    Example:
        >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
        >>> title, body, links, images, scripts = cleanup_html(html_content, "https://example.com")
        >>> title
        'Example'

    This function is particularly useful for preparing HTML content for
    environments where bandwidth usage needs to be minimized.
    """
soup = BeautifulSoup(html_content, "html.parser")
title_tag = soup.find("title")
title = title_tag.get_text() if title_tag else ""
script_content = extract_from_script_tags(soup)
for tag in soup.find_all("style"):
tag.extract()
link_urls = [
urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)
]
images = soup.find_all("img")
image_urls = []
    for image in images:
        if "src" in image.attrs:
            # Resolve relative image sources against the base URL
            if not image["src"].startswith("http"):
                image_urls.append(urljoin(base_url, image["src"]))
            else:
                image_urls.append(image["src"])
body_content = soup.find("body")
if body_content:
minimized_body = minify(str(body_content))
return title, minimized_body, link_urls, image_urls, script_content
else:
raise ValueError(
f"""No HTML body content found, please try setting the 'headless'
flag to False in the graph configuration. HTML content: {html_content}"""
)
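# Illustrative only: cleanup_html returns a 5-tuple rather than a single string.
# The sample HTML and base URL below are hypothetical.
#
#     title, body, links, images, scripts = cleanup_html(
#         "<html><head><title>Example</title></head>"
#         "<body><a href='/about'>About</a></body></html>",
#         base_url="https://example.com",
#     )
#     # links -> ["https://example.com/about"]; relative hrefs are resolved via urljoin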
def minify_html(html):
    """
    Minifies HTML by stripping comments and collapsing whitespace
    with a sequence of regex substitutions.
    """
    # Apply the substitutions in order: comments first, then whitespace collapsing
patterns = [
(r"<!--.*?-->", "", re.DOTALL),
(r">\s+<", "><", 0),
(r"\s+>", ">", 0),
(r"<\s+", "<", 0),
(r"\s+", " ", 0),
(r"\s*=\s*", "=", 0),
]
for pattern, repl, flags in patterns:
html = re.sub(pattern, repl, html, flags=flags)
return html.strip()
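# Illustrative only: a quick sketch of what minify_html does to a small snippet
# (the input string is hypothetical).
#
#     minify_html("<div>\n  <p> hi </p>\n</div>  <!-- note -->")
#     # -> '<div><p> hi </p></div>'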
def reduce_html(html, reduction):
    """
    Reduces the size of the HTML content based on the specified level of reduction.

    Args:
        html (str): The HTML content to reduce.
        reduction (int): The level of reduction to apply to the HTML content.
            0: minification only,
            1: minification and removing unnecessary tags and attributes,
            2: minification, removing unnecessary tags and attributes,
               simplifying text content, removing the head tag

    Returns:
        str: The reduced HTML content based on the specified reduction level.
    """
if reduction == 0:
return minify_html(html)
soup = BeautifulSoup(html, "html.parser")
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
comment.extract()
for tag in soup(["style"]):
tag.string = ""
attrs_to_keep = ["class", "id", "href", "src", "type"]
for tag in soup.find_all(True):
for attr in list(tag.attrs):
if attr not in attrs_to_keep:
del tag[attr]
if reduction == 1:
return minify_html(str(soup))
for tag in soup(["style"]):
tag.decompose()
body = soup.body
if not body:
return "No <body> tag found in the HTML"
    # Collapse whitespace in each text node and keep only a short preview
    for tag in body.find_all(string=True):
        if tag.parent.name not in ["script"]:
            tag.replace_with(re.sub(r"\s+", " ", tag.strip())[:20])
reduced_html = str(body)
reduced_html = minify_html(reduced_html)
return reduced_html
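# A minimal, illustrative demo of the three reduction levels handled by reduce_html.
# The sample HTML is hypothetical and only meant to show how aggressive each level is;
# it is not part of the library.
if __name__ == "__main__":
    sample = (
        "<html><head><title>Demo</title><style>p {color: red;}</style></head>"
        "<body><p class='intro' data-x='1'>  Hello,   world!  </p></body></html>"
    )
    for level in (0, 1, 2):
        print(f"reduction={level}: {reduce_html(sample, level)}")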