Merge pull request #18 from Zipstack/highlight_function

johnyrahul · web-flow · commit c3a23b59a937 · 2025-02-17T11:41:02.000+05:30
Highlight function
diff --git a/src/unstract/llmwhisperer/__init__.py b/src/unstract/llmwhisperer/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "2.1.0"
+__version__ = "2.2.0"
 
 from .client import LLMWhispererClient  # noqa: F401
 from .client_v2 import LLMWhispererClientV2  # noqa: F401
diff --git a/src/unstract/llmwhisperer/client_v2.py b/src/unstract/llmwhisperer/client_v2.py
@@ -155,6 +155,44 @@ def get_usage_info(self) -> dict:
             raise LLMWhispererClientException(err)
         return json.loads(response.text)
 
+    def get_highlight_data(self, whisper_hash: str, lines: str, extract_all_lines: bool = False) -> dict:
+        """Retrieves the highlight information of the LLMWhisperer API.
+
+        This method sends a GET request to the '/highlights' endpoint of the LLMWhisperer API; the response
+        is a JSON object containing the highlight information. NOTE(review): the link below names the usage API.
+        Refer to https://docs.unstract.com/llm_whisperer/apis/llm_whisperer_usage_api
+
+        Args:
+            whisper_hash (str): The hash of the whisper operation.
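+            lines (str): Which lines' metadata to retrieve, given as comma-separated numbers and ranges.
+                Example: 1-5,7,21- retrieves metadata for lines 1,2,3,4,5,7 and 21 through the last line.
+            extract_all_lines (bool, optional): Sent to the API as the 'extract_all_lines' query parameter;
+                presumably requests metadata for every line (confirm against the API docs). Defaults to False.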
+        Returns:
+            dict: A dictionary containing the highlight information.
+
+        Raises:
+            LLMWhispererClientException: If the API request fails, it raises an exception with
+                                          the error message and status code returned by the API.
+        """
+        self.logger.debug("highlight called")
+        url = f"{self.base_url}/highlights"
+        params = {
+            "whisper_hash": whisper_hash,
+            "lines": lines,
+            "extract_all_lines": extract_all_lines,
+        }
+        self.logger.debug("url: %s", url)
+        req = requests.Request("GET", url, headers=self.headers, params=params)
+        prepared = req.prepare()
+        s = requests.Session()
+        response = s.send(prepared, timeout=self.api_timeout)
+        if response.status_code != 200:
+            err = json.loads(response.text)
+            err["status_code"] = response.status_code
+            raise LLMWhispererClientException(err)
+        return json.loads(response.text)
+
     def whisper(
         self,
         file_path: str = "",
@@ -171,6 +209,7 @@ def whisper(
         mark_vertical_lines: bool = False,
         mark_horizontal_lines: bool = False,
         line_spitter_strategy: str = "left-priority",
+        add_line_nos: bool = False,
         lang="eng",
         tag="default",
         filename="",
@@ -201,6 +240,8 @@ def whisper(
             mark_vertical_lines (bool, optional): Whether to mark vertical lines. Defaults to False.
             mark_horizontal_lines (bool, optional): Whether to mark horizontal lines. Defaults to False.
             line_spitter_strategy (str, optional): The line splitter strategy. Defaults to "left-priority".
+            add_line_nos (bool, optional): Adds line numbers to the extracted text and saves line metadata,
+              which can be queried later using the highlights API. Defaults to False.
             lang (str, optional): The language of the document. Defaults to "eng".
             tag (str, optional): The tag for the document. Defaults to "default".
             filename (str, optional): The name of the file to store in reports. Defaults to "".
@@ -235,6 +276,7 @@ def whisper(
             "mark_vertical_lines": mark_vertical_lines,
             "mark_horizontal_lines": mark_horizontal_lines,
             "line_spitter_strategy": line_spitter_strategy,
+            "add_line_nos": add_line_nos,
             "lang": lang,
             "tag": tag,
             "filename": filename,
diff --git a/tests/integration/client_v2_test.py b/tests/integration/client_v2_test.py
@@ -78,6 +78,49 @@ def test_whisper_v2_error(client_v2, data_dir, output_mode, mode, input_file):
     assert_error_message(whisper_result)
 
 
+@pytest.mark.parametrize(
+    "input_file",
+    [
+        ("credit_card.pdf"),
+    ],
+)
+def test_highlight(client_v2, data_dir, input_file):
+    file_path = os.path.join(data_dir, input_file)
+
+    whisper_result = client_v2.whisper(
+        add_line_nos=True,
+        file_path=file_path,
+        wait_for_completion=True,
+    )
+    whisper_hash = whisper_result["whisper_hash"]
+    highlight_data = client_v2.get_highlight_data(whisper_hash=whisper_hash, lines="1-2")
+
+    # Assert the structure and content of highlight_data
+    assert isinstance(highlight_data, dict)
+    assert len(highlight_data) == 2
+    assert "1" in highlight_data
+    assert "2" in highlight_data
+
+    # Assert line 1 data
+    line1 = highlight_data["1"]
+    assert line1["base_y"] == 0
+    assert line1["base_y_percent"] == 0
+    assert line1["height"] == 0
+    assert line1["height_percent"] == 0
+    assert line1["page"] == 0
+    assert line1["page_height"] == 0
+    assert line1["raw"] == [0, 0, 0, 0]
+
+    # Assert line 2 data
+    line2 = highlight_data["2"]
+    assert line2["base_y"] == 155
+    assert line2["base_y_percent"] == pytest.approx(4.8927)  # Using approx for float comparison
+    assert line2["height"] == 51
+    assert line2["height_percent"] == pytest.approx(1.6098)  # Using approx for float comparison
+    assert line2["page"] == 0
+    assert line2["page_height"] == 3168
+
+
 @pytest.mark.parametrize(
     "output_mode, mode, url, input_file, page_count",
     [

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = "2.1.0"`
	`1`	`+__version__ = "2.2.0"`
`2`	`2`
`3`	`3`	`from .client import LLMWhispererClient # noqa: F401`
`4`	`4`	`from .client_v2 import LLMWhispererClientV2 # noqa: F401`