Skip to content

Commit c3a23b5

Browse files
authored
Merge pull request #18 from Zipstack/highlight_function
Highlight function
2 parents 561c746 + a67e42f commit c3a23b5

File tree

3 files changed

+86
-1
lines changed

3 files changed

+86
-1
lines changed

src/unstract/llmwhisperer/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "2.1.0"
1+
__version__ = "2.2.0"
22

33
from .client import LLMWhispererClient # noqa: F401
44
from .client_v2 import LLMWhispererClientV2 # noqa: F401

src/unstract/llmwhisperer/client_v2.py

+42
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,44 @@ def get_usage_info(self) -> dict:
155155
raise LLMWhispererClientException(err)
156156
return json.loads(response.text)
157157

158+
def get_highlight_data(self, whisper_hash: str, lines: str, extract_all_lines: bool = False) -> dict:
    """Retrieve the highlight (line metadata) information of a whisper operation.

    This method sends a GET request to the '/highlights' endpoint of the
    LLMWhisperer API. The response is a JSON object containing the highlight
    information for the requested lines.

    Args:
        whisper_hash (str): The hash of the whisper operation.
        lines (str): Define which lines metadata to retrieve.
            Example 1-5,7,21- will retrieve lines metadata 1,2,3,4,5,7,21,22,23,24...
            till the last line meta data.
        extract_all_lines (bool, optional): Forwarded unchanged as the
            'extract_all_lines' query parameter. Defaults to False.
            NOTE(review): presumably asks the API for the metadata of every
            line regardless of ``lines`` - confirm against the API docs.

    Returns:
        dict: A dictionary containing the highlight information.

    Raises:
        LLMWhispererClientException: If the API responds with a status code
            other than 200. The exception carries the error payload returned
            by the API plus a "status_code" entry.
    """
    self.logger.debug("highlight called")
    url = f"{self.base_url}/highlights"
    params = {
        "whisper_hash": whisper_hash,
        "lines": lines,
        "extract_all_lines": extract_all_lines,
    }
    self.logger.debug("url: %s", url)
    req = requests.Request("GET", url, headers=self.headers, params=params)
    prepared = req.prepare()
    # Use the session as a context manager so its connection pool is released
    # once the (fully read) response has been received.
    with requests.Session() as session:
        response = session.send(prepared, timeout=self.api_timeout)
    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON objects (e.g. an HTML page
        # from a proxy); still surface them as the documented exception type.
        try:
            err = json.loads(response.text)
        except ValueError:
            err = None
        if not isinstance(err, dict):
            # NOTE(review): the "message" key is assumed to match the shape of
            # the API's own error payloads - confirm.
            err = {"message": response.text}
        err["status_code"] = response.status_code
        raise LLMWhispererClientException(err)
    return json.loads(response.text)
195+
158196
def whisper(
159197
self,
160198
file_path: str = "",
@@ -171,6 +209,7 @@ def whisper(
171209
mark_vertical_lines: bool = False,
172210
mark_horizontal_lines: bool = False,
173211
line_spitter_strategy: str = "left-priority",
212+
add_line_nos: bool = False,
174213
lang="eng",
175214
tag="default",
176215
filename="",
@@ -201,6 +240,8 @@ def whisper(
201240
mark_vertical_lines (bool, optional): Whether to mark vertical lines. Defaults to False.
202241
mark_horizontal_lines (bool, optional): Whether to mark horizontal lines. Defaults to False.
203242
line_spitter_strategy (str, optional): The line splitter strategy. Defaults to "left-priority".
243+
add_line_nos (bool, optional): Adds line numbers to the extracted text and saves line metadata,
244+
which can be queried later using the highlights API.
204245
lang (str, optional): The language of the document. Defaults to "eng".
205246
tag (str, optional): The tag for the document. Defaults to "default".
206247
filename (str, optional): The name of the file to store in reports. Defaults to "".
@@ -235,6 +276,7 @@ def whisper(
235276
"mark_vertical_lines": mark_vertical_lines,
236277
"mark_horizontal_lines": mark_horizontal_lines,
237278
"line_spitter_strategy": line_spitter_strategy,
279+
"add_line_nos": add_line_nos,
238280
"lang": lang,
239281
"tag": tag,
240282
"filename": filename,

tests/integration/client_v2_test.py

+43
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,49 @@ def test_whisper_v2_error(client_v2, data_dir, output_mode, mode, input_file):
7878
assert_error_message(whisper_result)
7979

8080

81+
@pytest.mark.parametrize("input_file", ["credit_card.pdf"])
def test_highlight(client_v2, data_dir, input_file):
    """Extract a document with line numbers enabled and verify the highlight data.

    The file is whispered with ``add_line_nos=True``, then the highlight
    metadata of lines 1-2 is fetched and compared against the expected values.
    """
    document_path = os.path.join(data_dir, input_file)

    extraction = client_v2.whisper(
        add_line_nos=True,
        file_path=document_path,
        wait_for_completion=True,
    )
    highlights = client_v2.get_highlight_data(
        whisper_hash=extraction["whisper_hash"],
        lines="1-2",
    )

    # Overall shape: a dict with exactly the two requested lines.
    assert isinstance(highlights, dict)
    assert len(highlights) == 2
    assert "1" in highlights
    assert "2" in highlights

    # Line 1 is expected to carry all-zero metadata.
    first_line = highlights["1"]
    for field in (
        "base_y",
        "base_y_percent",
        "height",
        "height_percent",
        "page",
        "page_height",
    ):
        assert first_line[field] == 0
    assert first_line["raw"] == [0, 0, 0, 0]

    # Line 2 carries real geometry; the percentages are floats, so compare approximately.
    second_line = highlights["2"]
    assert second_line["base_y"] == 155
    assert second_line["base_y_percent"] == pytest.approx(4.8927)
    assert second_line["height"] == 51
    assert second_line["height_percent"] == pytest.approx(1.6098)
    assert second_line["page"] == 0
    assert second_line["page_height"] == 3168
122+
123+
81124
@pytest.mark.parametrize(
82125
"output_mode, mode, url, input_file, page_count",
83126
[

0 commit comments

Comments
 (0)