From c9e8ee488a7b05b8e1139253a52d82379885ac5e Mon Sep 17 00:00:00 2001 From: Scott Date: Mon, 31 Mar 2025 14:33:33 -0400 Subject: [PATCH 01/14] remove doc extraction, review, and dataset classes --- examples/copy_teach_task.py | 20 - .../create_auto_classification_workflow.py | 27 -- examples/dataset_tasks.py | 25 -- examples/merge_snapshots.py | 20 - examples/pdf_highlighter.py | 35 -- examples/submitting_to_doc_extraction.py | 24 -- indico_toolkit/auto_populate/__init__.py | 3 - indico_toolkit/auto_populate/populator.py | 362 ------------------ indico_toolkit/auto_populate/types.py | 66 ---- indico_toolkit/indico_wrapper/__init__.py | 6 - indico_toolkit/indico_wrapper/dataset.py | 114 ------ .../indico_wrapper/doc_extraction.py | 79 ---- indico_toolkit/indico_wrapper/reviewer.py | 111 ------ indico_toolkit/ocr/__init__.py | 2 - indico_toolkit/ocr/customocr_object.py | 42 -- mypy.ini | 3 - tests/integration/conftest.py | 33 +- .../indico_wrapper/test_dataset.py | 40 -- .../indico_wrapper/test_doc_extraction.py | 127 ------ .../indico_wrapper/test_reviewer.py | 60 --- tests/integration/ocr/__init__.py | 0 .../integration/ocr/test_customocr_object.py | 47 --- tests/integration/ocr/test_ondoc_object.py | 63 --- tests/integration/ocr/test_standard_object.py | 33 -- tests/integration/test_populator.py | 80 ---- 25 files changed, 1 insertion(+), 1421 deletions(-) delete mode 100644 examples/copy_teach_task.py delete mode 100644 examples/create_auto_classification_workflow.py delete mode 100644 examples/dataset_tasks.py delete mode 100644 examples/pdf_highlighter.py delete mode 100644 examples/submitting_to_doc_extraction.py delete mode 100644 indico_toolkit/auto_populate/__init__.py delete mode 100644 indico_toolkit/auto_populate/populator.py delete mode 100644 indico_toolkit/auto_populate/types.py delete mode 100644 indico_toolkit/indico_wrapper/dataset.py delete mode 100644 indico_toolkit/indico_wrapper/doc_extraction.py delete mode 100644 indico_toolkit/indico_wrapper/reviewer.py delete mode 100644 indico_toolkit/ocr/customocr_object.py delete mode 100644 tests/integration/indico_wrapper/test_dataset.py delete mode 100644 tests/integration/indico_wrapper/test_doc_extraction.py delete mode 100644 tests/integration/indico_wrapper/test_reviewer.py delete mode 100644 tests/integration/ocr/__init__.py delete mode 100644 tests/integration/ocr/test_customocr_object.py delete mode 100644 tests/integration/ocr/test_ondoc_object.py delete mode 100644 tests/integration/ocr/test_standard_object.py delete mode 100644 tests/integration/test_populator.py diff --git a/examples/copy_teach_task.py b/examples/copy_teach_task.py deleted file mode 100644 index d0b7f459..00000000 --- a/examples/copy_teach_task.py +++ /dev/null @@ -1,20 +0,0 @@ -from indico_toolkit import create_client -from indico_toolkit.auto_populate import AutoPopulator - -""" -Create a new copied Workflow based on given Teach Task Id -and corresponding Dataset Id. 
-""" - -HOST = "app.indico.io" -API_TOKEN_PATH = "./indico_api_token.txt" -DATASET_ID = 0 -TEACH_TASK_ID = 0 - -client = create_client(HOST, API_TOKEN_PATH) -auto_populator = AutoPopulator(client) -new_workflow = auto_populator.copy_teach_task( - dataset_id=DATASET_ID, - teach_task_id=TEACH_TASK_ID, - workflow_name="Copied Workflow", -) diff --git a/examples/create_auto_classification_workflow.py b/examples/create_auto_classification_workflow.py deleted file mode 100644 index 355adeb4..00000000 --- a/examples/create_auto_classification_workflow.py +++ /dev/null @@ -1,27 +0,0 @@ -from indico_toolkit import create_client -from indico_toolkit.auto_populate import AutoPopulator - -""" -Create an Indico Classification Workflow without any labeling using an organized -directory/folder structure. Each folder/directory should contain only one file type. - -For example, you would target '/base_directory/' if you had your files organized like: - -/base_directory/ -/base_directory/invoices/ -> contains only invoice files -/base_directory/disclosures/ -> contains only disclosure files -""" - -HOST = "app.indico.io" -API_TOKEN_PATH = "./indico_api_token.txt" - -DIRECTORY_FILE_PATH = "./base_directory/" - -client = create_client(HOST, API_TOKEN_PATH) -auto_populator = AutoPopulator(client) -new_workflow = auto_populator.create_auto_classification_workflow( - DIRECTORY_FILE_PATH, - "My dataset", - "My workflow", - "My teach task", -) diff --git a/examples/dataset_tasks.py b/examples/dataset_tasks.py deleted file mode 100644 index a53a2196..00000000 --- a/examples/dataset_tasks.py +++ /dev/null @@ -1,25 +0,0 @@ -from indico_toolkit import create_client -from indico_toolkit.indico_wrapper import Datasets, Download -from indico_toolkit.pipelines import FileProcessing - -DATASET_ID = 1234 -HOST = "app.indico.io" -API_TOKEN_PATH = "./indico_api_token.txt" - -# Instantiate the datasets class -client = create_client(HOST, API_TOKEN_PATH) -datasets = Datasets(client, DATASET_ID) -downloader = Download(client) -""" -Example 1: - -Upload files to an existing dataset in batches -""" -# Collect files to upload -fp = FileProcessing() -fp.get_file_paths_from_dir("./datasets/disclosures/") - -# Upload files to dataset in batches -for paths in fp.batch_files(batch_size=2): - datasets.add_files_to_dataset(paths) - print(f"Uploaded {len(paths)} files") diff --git a/examples/merge_snapshots.py b/examples/merge_snapshots.py index 6b004d3f..d204f206 100644 --- a/examples/merge_snapshots.py +++ b/examples/merge_snapshots.py @@ -1,9 +1,5 @@ -from indico_toolkit import create_client -from indico_toolkit.indico_wrapper import Datasets from indico_toolkit.snapshots import Snapshot -HOST = "app.indico.io" -API_TOKEN_PATH = "./indico_api_token.txt" PATH_TO_SNAPSHOT = "./snapshot_1.csv" PATH_TO_SNAPSHOT_2 = "./snapshot_2.csv" OUTPUT_PATH = "./merged_snapshot_output.csv" @@ -34,19 +30,3 @@ # will now include all of the samples from snap_to_append as well print(main_snap.number_of_samples) main_snap.to_csv(OUTPUT_PATH) - -""" -With that merged snapshot, you can now use the toolkit to upload and train a model. 
-""" -client = create_client(HOST, API_TOKEN_PATH) -dataset = Datasets(client) -uploaded_dataset = dataset.create_dataset([OUTPUT_PATH], dataset_name="my_dataset") -print(f"My Dataset ID is {uploaded_dataset.id}") -model = dataset.train_model( - uploaded_dataset, - model_name="my_model", - source_col=main_snap.text_col, - target_col=main_snap.label_col, - wait=False, -) -print(f"My Model Group ID is {model.id}") diff --git a/examples/pdf_highlighter.py b/examples/pdf_highlighter.py deleted file mode 100644 index 3540205b..00000000 --- a/examples/pdf_highlighter.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Highlight Indico Extraction Predictions on the source PDF -""" - -from indico_toolkit import create_client -from indico_toolkit.highlighter import Highlighter -from indico_toolkit.indico_wrapper import Workflow - -WORKFLOW_ID = 1418 -HOST = "app.indico.io" -API_TOKEN_PATH = "./indico_api_token.txt" -PATH_TO_DOCUMENT = "./mydocument.pdf" -# Instantiate the workflow class -client = create_client(HOST, API_TOKEN_PATH) -wflow = Workflow(client) - -# Get predictions and ondocument OCR object -submission_ids = wflow.submit_documents_to_workflow(WORKFLOW_ID, [PATH_TO_DOCUMENT]) -submission_result = wflow.get_submission_results_from_ids(submission_ids)[0] -ocr_object = wflow.get_ondoc_ocr_from_etl_url(submission_result.etl_url) - -# Highlight Predictions onto source document and write it to disc -highlighter = Highlighter(submission_result.predictions, PATH_TO_DOCUMENT) -highlighter.collect_tokens(ocr_object.token_objects) -highlighter.highlight_pdf("./highlighted_doc.pdf", ocr_object.page_heights_and_widths) - -# You can also have unique color highlights for each label group, write the label above -# the highlight, and add bookmarks of what labels appear on which pages -highlighter.highlight_pdf( - "./highlighted_doc.pdf", - ocr_object.page_heights_and_widths, - all_yellow_highlight=False, - add_label_annotations=True, - add_bookmarks=True, -) diff --git a/examples/submitting_to_doc_extraction.py b/examples/submitting_to_doc_extraction.py deleted file mode 100644 index 2bc20cc2..00000000 --- a/examples/submitting_to_doc_extraction.py +++ /dev/null @@ -1,24 +0,0 @@ -from indico_toolkit import create_client -from indico_toolkit.indico_wrapper import DocExtraction -from indico_toolkit.pipelines import FileProcessing - -""" -Retrieves a list of raw full document texts for all files in a folder -""" - -HOST = "app.indico.io" -API_TOKEN_PATH = "./indico_api_token.txt" - -# Instantiate the doc_extraction class -client = create_client(HOST, API_TOKEN_PATH) -doc_extraction = DocExtraction(client=client, preset_config="ondocument") - -# Collect files to submit -fp = FileProcessing() -fp.get_file_paths_from_dir("./datasets/disclosures/") - -# Submit documents with optional text setting and save results to variable -doc_texts = [] -for paths in fp.batch_files(batch_size=10): - doc_texts.append(doc_extraction.run_ocr(filepaths=paths, text_setting="full_text")) -print(doc_texts) diff --git a/indico_toolkit/auto_populate/__init__.py b/indico_toolkit/auto_populate/__init__.py deleted file mode 100644 index c3330c10..00000000 --- a/indico_toolkit/auto_populate/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .populator import AutoPopulator - -__all__ = ("AutoPopulator",) diff --git a/indico_toolkit/auto_populate/populator.py b/indico_toolkit/auto_populate/populator.py deleted file mode 100644 index 9b2e6396..00000000 --- a/indico_toolkit/auto_populate/populator.py +++ /dev/null @@ -1,362 +0,0 @@ -import dataclasses 
-import time -from json import loads -from os import PathLike -from pathlib import Path -from typing import Dict, List, Tuple, Union - -from indico import IndicoClient -from indico.queries import ( - CreateExport, - DownloadExport, - GetDataset, - GetModelGroup, -) -from indico.types import Workflow - -from ..errors import ToolkitPopulationError -from ..structure.create_structure import Structure -from .types import ( - Example, - ExampleList, - LabelInput, - LabelInst, - TokenSpanInput, -) - - -class AutoPopulator: - def __init__(self, client: IndicoClient): - """ - Module for manipulating and creating new workflows and teach tasks. - - Args: - client (IndicoClient): instantiated Indico Client - """ - self.client = client - self.structure = Structure(client) - self._exceptions = [] - - def create_auto_classification_workflow( - self, - directory_path: Union[str, PathLike[str]], - dataset_name: str, - workflow_name: str, - teach_task_name: str, - accepted_types: Tuple[str, ...] = ( - "csv", - "doc", - "docx", - "eml", - "jpeg", - "jpg", - "msg", - "pdf", - "png", - "pptx", - "rtf", - "svg", - "tif", - "tiff", - "txt", - "xls", - "xlsx", - ), - ) -> Workflow: - """ - Label and train a model based on a directory structure or existing teach task. - You should have a base directory containing sub directories where each - directory contains a unique file type and only that file type. - - Example: - base_directory/ - base_directory/invoices/ -> folder containing only invoices - base_directory/disclosures/ -> folder containing only disclosures - etc. etc. - Args: - directory_path (str): Path to a directory containing your filepath structure - dataset_name (str): Name of created dataset - worlflow_name (str): Name of created workflow - teach_task_name (str): Name of created teach task - accepted_types (Tuple[str], optional): List of accepted file types to search - Returns: - Workflow: a Workflow object representation of the newly created workflow - """ - - def valid_file(file: Path) -> bool: - return ( - file.is_file() and file.suffix.strip(".").casefold() in accepted_types - ) - - folder = Path(directory_path) - files = list(filter(valid_file, folder.glob("*/*"))) - classes = list(set(file.parent.name for file in files)) - labeled_files = {file.name: [{"label": file.parent.name}] for file in files} - - if len(classes) < 2: - raise ToolkitPopulationError( - "You must have documents in at least 2 directories, " - f"you only have {len(classes)}" - ) - - # Upload files to a new dataset. - dataset = self.structure.create_dataset( - dataset_name=dataset_name, - files_to_upload=files, - read_api=True, - single_column=False, - auto_rotate=False, - upscale_images=True, - languages=["ENG"], - ) - - # Create a new workflow with classification model. 
- workflow = self.structure.create_workflow(workflow_name, dataset.id) - workflow = self.structure.add_teach_task( - task_name=teach_task_name, - labelset_name=f"{teach_task_name}_labelset", - target_names=classes, - dataset_id=dataset.id, - workflow_id=workflow.id, - model_type="classification", - ) - teach_task_id = workflow.components[-1].model_group.questionnaire_id - labelset_id, model_group_id, label_map = self._get_teach_task_details( - teach_task_id - ) - - labels = self.get_labels_by_filename(model_group_id, labeled_files, label_map) - self.structure.label_teach_task( - label_set_id=labelset_id, - labels=list(map(dataclasses.asdict, labels)), - model_group_id=model_group_id, - ) - - return workflow - - def copy_teach_task( - self, - dataset_id: int, - teach_task_id: int, - workflow_name: str, - data_column: str = "document", - rename_labels: Dict[str, str] = None, - remove_labels: List[str] = None, - ) -> Workflow: - """ - Create duplicate teach task in same Indico platform. - - Note: Does not work with datasets created with a snapshot - - Args: - dataset_id (int): The dataset id of the dataset you wish to copy - teach_task_id (int): The teach task id of the corresponding teach task to - the dataset - workflow_name (string): The name of the newly created workflow - data_column_id (str, optional): The datacolumn id of the corresponding - dataset. Defaults to 'document' - rename_labels (dict, optional): Dictionary in format - {old_label_name : new_label_name} - remove_labels (list, optional): List of labels to remove from old teach task - - Returns: - Workflow: a Workflow object representation of the newly created workflow - """ - dataset = self.client.call(GetDataset(dataset_id)) - ( - old_labelset_id, - old_model_group_id, - old_target_name_map, - ) = self._get_teach_task_details(teach_task_id=teach_task_id) - # get dataset snapshot - export = self.client.call( - CreateExport(dataset_id=dataset.id, labelset_id=old_labelset_id, wait=True) - ) - csv = self.client.call(DownloadExport(export.id)) - print("Obtained snapshot") - - # create workflow - workflow = self.structure.create_workflow( - name=workflow_name, dataset_id=dataset_id - ) - time.sleep(2) - print("Created workflow") - old_model_group = self.client.call( - GetModelGroup(id=old_model_group_id, wait=True) - ) - model_type = old_model_group.task_type.lower() - # Create new teach task - workflow = self.structure.add_teach_task( - task_name=workflow_name, - labelset_name=workflow_name, - target_names=list(old_target_name_map.keys()), - dataset_id=dataset.id, - workflow_id=workflow.id, - model_type=model_type, - data_column=data_column, - ) - ( - new_labelset_id, - new_model_group_id, - new_target_name_map, - ) = self._get_teach_task_details( - workflow.components[-1].model_group.questionnaire_id - ) - # Get file_to_targets from export CSV - file_to_targets = {} - for _, row in csv.iterrows(): - # Check for NaN filled rows - if isinstance(row[2], float): - continue - old_example_id = row[0] - old_examples = self._get_example_list(old_model_group_id) - targets_list = loads(row[2])["targets"] - file_to_targets[old_examples.get_example(old_example_id).data_file_name] = ( - targets_list - ) - labels = self.get_labels_by_filename( - new_model_group_id, - file_to_targets, - new_target_name_map, - rename_labels, - remove_labels, - ) - # Label new teach task - result = self.structure.label_teach_task( - label_set_id=new_labelset_id, - labels=[dataclasses.asdict(label) for label in labels], - model_group_id=new_model_group_id, - ) - if 
not result["submitLabelsV2"]["success"]: - raise ToolkitPopulationError("Error: Failed to submit labels") - return workflow - - def inject_labels_into_teach_task( - self, - workflow_id: int, - teach_task_id: int, - file_to_targets: dict, - rename_labels: Dict[str, str] = None, - remove_labels: List[str] = None, - ): - """ - Add label data into existing teach task - - Args: - workflow_id (int): Id of the workflow you wish to add labels to - teach_task_id (int): Id of the corresponding teach task to the workflow - file_to_targets (dict): mapping of filenames to target label data - rename_labels (dict, optional): Dictionary in format - {old_label_name : new_label_name} - remove_labels (list, optional): List of labels to remove from old teach task - """ - ( - labelset_id, - model_group_id, - target_name_map, - ) = self._get_teach_task_details(teach_task_id) - labels = self.get_labels_by_filename( - model_group_id, - file_to_targets, - target_name_map, - rename_labels, - remove_labels, - ) - # Label new teach task - result = self.structure.label_teach_task( - label_set_id=labelset_id, - labels=[dataclasses.asdict(label) for label in labels], - model_group_id=model_group_id, - ) - if not result["submitLabelsV2"]["success"]: - raise ToolkitPopulationError("Error: Failed to submit labels") - - def get_labels_by_filename( - self, - model_group_id: int, - file_to_targets: dict, - target_name_map: dict, - rename_labels: Dict[str, str] = None, - remove_labels: List[str] = None, - ) -> List[LabelInput]: - """ - Args: - model_group_id (int): ID of the model group to be labeled - file_to_targets (dict): mapping in the format {filename : targets_list} - target_name_map (dict): mapping of field name to corresponding target ID - rename_labels (dict, optional): Dictionary in format - {old_label_name : new_label_name} - remove_labels (list, optional): List of labels to remove from old teach task - - Returns: - A list of LabelInput to be ingested by the platform via submitLabelsV2 - """ - labels = [] - # Retrieve examples and match against filename - examples = self._get_example_list(model_group_id) - - for filename, targets_list in file_to_targets.items(): - if rename_labels or remove_labels: - targets_list = self._edit_labels( - targets_list, rename_labels, remove_labels - ) - targets_list = self._convert_label(targets_list, target_name_map) - example_id = examples.get_example_id(filename) - if example_id: - labels.append(LabelInput(example_id, targets_list)) - return labels - - def _edit_labels( - self, - targets_list: List[dict], - rename_labels: Dict[str, str], - remove_labels: List[str], - ): - new_targets_list = [] - for target in targets_list: - if remove_labels and target["label"] not in remove_labels: - if rename_labels and rename_labels.get(target["label"]): - target["label"] = rename_labels[target["label"]] - new_targets_list.append(target) - return new_targets_list - - def _convert_label( - self, targets_list: List[dict], target_name_map: dict - ) -> List[LabelInst]: - updated_labels = [] - for target in targets_list: - updated_label = LabelInst(target_name_map[target["label"]]) - if target.get("spans"): - updated_spans = [ - TokenSpanInput(span["start"], span["end"], span["page_num"]) - for span in target["spans"] - ] - updated_label.spans = updated_spans - updated_labels.append(updated_label) - return updated_labels - - def _get_teach_task_details(self, teach_task_id: int): - teach_task_details = self.structure.get_teach_details( - teach_task_id=teach_task_id - ) - labelset_id = 
teach_task_details["questionnaire"]["question"]["labelset"]["id"] - model_group_id = teach_task_details["questionnaire"]["question"]["modelGroupId"] - target_names = teach_task_details["questionnaire"]["question"]["labelset"][ - "targetNames" - ] - target_name_map = {} - for target in target_names: - target_name_map[target["name"]] = target["id"] - return labelset_id, model_group_id, target_name_map - - def _get_example_list(self, model_group_id: int, limit=1000): - examples = self.structure.get_example_ids( - model_group_id=model_group_id, limit=limit - ) - examples = ExampleList( - examples=[ - Example(i["id"], i["datafile"]["name"]) - for i in examples["modelGroup"]["pagedExamples"]["examples"] - ] - ) - return examples diff --git a/indico_toolkit/auto_populate/types.py b/indico_toolkit/auto_populate/types.py deleted file mode 100644 index dae2460d..00000000 --- a/indico_toolkit/auto_populate/types.py +++ /dev/null @@ -1,66 +0,0 @@ -from dataclasses import dataclass -from typing import List - - -@dataclass -class Example: - id: int - data_file_name: str - - -class ExampleList: - def __init__(self, examples: List[Example]): - self.examples = examples - - def get_example(self, example_id: int) -> Example: - """ - Returns example with matching example_id. If no matching example id found, - return None. - """ - for example in self.examples: - if example.id == example_id: - return example - return None - - def get_example_id(self, example_data_file_name: str) -> int: - """ - Returns id for a specific example with the same name as example_data_file_name. - If no matching example found, return None. Assumes no duplicate filenames in - dataset - """ - for example in self.examples: - if example.data_file_name == example_data_file_name: - return example.id - return None - - -@dataclass -class TokenSpanInput: - start: int - end: int - pageNum: int - - -@dataclass -class SpatialSpanInput: - top: int - bottom: int - left: int - right: int - pageNum: int - - -@dataclass -class LabelInst: - clsId: int - spans: List[TokenSpanInput] = None - bounds: List[SpatialSpanInput] = None - - -@dataclass -class LabelInput: - exampleId: int - targets: List[LabelInst] - rejected: bool = None - override: bool = None - partial: bool = None diff --git a/indico_toolkit/indico_wrapper/__init__.py b/indico_toolkit/indico_wrapper/__init__.py index 40dd45aa..faa39014 100644 --- a/indico_toolkit/indico_wrapper/__init__.py +++ b/indico_toolkit/indico_wrapper/__init__.py @@ -1,15 +1,9 @@ -from .dataset import Datasets -from .doc_extraction import DocExtraction from .download import Download from .indico_wrapper import IndicoWrapper -from .reviewer import Reviewer from .workflow import Workflow __all__ = ( - "Datasets", - "DocExtraction", "Download", "IndicoWrapper", - "Reviewer", "Workflow", ) diff --git a/indico_toolkit/indico_wrapper/dataset.py b/indico_toolkit/indico_wrapper/dataset.py deleted file mode 100644 index a954d5ab..00000000 --- a/indico_toolkit/indico_wrapper/dataset.py +++ /dev/null @@ -1,114 +0,0 @@ -from typing import List - -from indico import IndicoClient -from indico.queries import ( - AddDataToWorkflow, - AddFiles, - CreateDataset, - CreateEmptyDataset, - DeleteDataset, - GetDataset, -) -from indico.types import Dataset, OcrEngine, Workflow - -from .indico_wrapper import IndicoWrapper - - -class Datasets(IndicoWrapper): - def __init__(self, client: IndicoClient): - self.client = client - - def get_dataset(self, dataset_id: int): - return self.client.call(GetDataset(dataset_id)) - - def 
add_files_to_dataset(self, dataset_id: int, filepaths: List[str]) -> Dataset: - """ - Upload documents to an existing dataset and wait for them to OCR - """ - dataset = self.client.call( - AddFiles( - dataset_id=dataset_id, files=filepaths, autoprocess=True, wait=True - ) - ) - return dataset - - def add_new_files_to_task(self, workflow_id: id, wait: bool = True) -> Workflow: - """ - Add newly uploaded documents to an existing teach task given the task's - associated workflow ID - - Args: - workflow_id (id): workflow ID associated with teach task - wait (bool, optional): wait for data to be added. Defaults to True. - """ - workflow = self.client.call(AddDataToWorkflow(workflow_id, wait)) - if wait: - print(f"Data added to all teach tasks associated with {workflow.id}") - return workflow - - def create_empty_dataset( - self, - dataset_name: str, - dataset_type: str = "DOCUMENT", - ocr_engine: OcrEngine = OcrEngine.READAPI, - ) -> Dataset: - """ - Create an empty dataset - Args: - name (str): Name of the dataset - dataset_type (str, optional): TEXT, IMAGE, or DOCUMENT. - Defaults to "DOCUMENT". - """ - return self.client.call( - CreateEmptyDataset(dataset_name, dataset_type, ocr_engine) - ) - - def create_dataset( - self, - filepaths: List[str], - dataset_name: str, - ocr_engine: OcrEngine = OcrEngine.READAPI, - ) -> Dataset: - dataset = self.client.call( - CreateDataset( - name=dataset_name, - files=filepaths, - ocr_engine=ocr_engine, - ) - ) - self.dataset_id = dataset.id - return dataset - - def delete_dataset(self, dataset_id: int) -> bool: - """ - Returns True if operation is succesful - """ - return self.client.call(DeleteDataset(id=dataset_id)) - - def get_dataset_metadata(self, dataset_id: int) -> List[dict]: - """ - Get list of dataset files with information like file name, status, and number of - pages - """ - query = """ - query GetDataset($id: Int) { - dataset(id: $id) { - id - name - files { - id - name - numPages - status - } - } - } - """ - dataset = self.graphQL_request( - graphql_query=query, variables={"id": dataset_id} - ) - return dataset["dataset"]["files"] - - def get_col_name_by_id(self, dataset_id: int, col_id: int) -> str: - dataset = self.get_dataset(dataset_id) - return next(c.name for c in dataset.datacolumns if c.id == col_id) diff --git a/indico_toolkit/indico_wrapper/doc_extraction.py b/indico_toolkit/indico_wrapper/doc_extraction.py deleted file mode 100644 index 3a10170d..00000000 --- a/indico_toolkit/indico_wrapper/doc_extraction.py +++ /dev/null @@ -1,79 +0,0 @@ -from typing import List, Union - -from indico import IndicoClient -from indico.queries import DocumentExtraction, Job - -from ..ocr import CustomOcr, OnDoc, StandardOcr -from .indico_wrapper import IndicoWrapper - - -class DocExtraction(IndicoWrapper): - """ - Class to support DocumentExtraction-related API calls - """ - - def __init__( - self, - client: IndicoClient, - preset_config: str = "standard", - custom_config: dict = None, - ): - """ - Args: - preset_config (str): Options are simple, legacy, detailed, ondocument, and - standard. 
- """ - self._preset_config = preset_config - self.client = client - self.json_config = {"preset_config": preset_config} - if custom_config: - self.json_config = custom_config - - def run_ocr( - self, filepaths: List[str], text_setting: str = None - ) -> List[Union[StandardOcr, OnDoc, CustomOcr, str]]: - """ - Args: - filepaths (List[str]): List of paths to local documents you would like to - submit for extraction - text_setting (str): Options are full_text and page_texts. - - Returns: - extracted_data (List[Union[StandardOcr, OnDoc, CustomOcr, str]]): data from - DocumentExtraction converted to OCR objects or string text - """ - jobs = self._submit_to_ocr(filepaths) - extracted_data = [] - for ind, job in enumerate(jobs): - status = self.get_job_status(job.id, True) - if status.status == "SUCCESS": - result = self.get_storage_object(status.result) - if text_setting == "full_text": - extracted_data.append(self._convert_ocr_objects(result).full_text) - elif text_setting == "page_texts": - extracted_data.append(self._convert_ocr_objects(result).page_texts) - else: - extracted_data.append(self._convert_ocr_objects(result)) - else: - raise RuntimeError( - f"{filepaths[ind]} {status.status}: {status.result}." - ) - return extracted_data - - def _submit_to_ocr(self, filepaths: List[str]) -> List[Job]: - return self.client.call( - DocumentExtraction(files=filepaths, json_config=self.json_config) - ) - - def _convert_ocr_objects( - self, extracted_data: Union[List[dict], dict] - ) -> Union[StandardOcr, OnDoc, CustomOcr]: - if self.json_config == {"preset_config": "ondocument"}: - return OnDoc(extracted_data) - elif ( - self.json_config == {"preset_config": "standard"} - or self.json_config is None - ): - return StandardOcr(extracted_data) - else: - return CustomOcr(extracted_data) diff --git a/indico_toolkit/indico_wrapper/reviewer.py b/indico_toolkit/indico_wrapper/reviewer.py deleted file mode 100644 index faf6e9a4..00000000 --- a/indico_toolkit/indico_wrapper/reviewer.py +++ /dev/null @@ -1,111 +0,0 @@ -import json - -from indico import IndicoClient - -from .indico_wrapper import Workflow - - -class Reviewer(Workflow): - """ - Class to simulate human reviewer - """ - - def __init__( - self, - client: IndicoClient, - workflow_id: int, - ): - self.client = client - self.workflow_id = workflow_id - - def accept_review(self, submission_id: int, changes: dict) -> None: - """ - Accept a submission in the review queue - Args: - submission_id (int): submission ID - changes (dict): accepted predictions with format like, - e.g. 
{"model_name": [{"label"...}]} - """ - self.graphQL_request( - SUBMIT_REVIEW, - { - "rejected": False, - "submissionId": submission_id, - "changes": json.dumps(changes), - }, - ) - - def get_random_review_id(self): - response = self.graphQL_request( - GET_RANDOM_REVIEW_ID, {"workflowId": self.workflow_id} - ) - try: - return response["randomSubmission"]["id"] - except Exception: - raise RuntimeError("The review queue is empty") - - def get_random_exception_id(self): - response = self.graphQL_request( - GET_RANDOM_EXCEPTION_ID, {"workflowId": self.workflow_id} - ) - try: - return response["randomSubmission"]["id"] - except Exception: - raise RuntimeError("The exception queue is empty") - - def reject_submission(self, submission_id): - return self.graphQL_request( - SUBMIT_REVIEW, {"rejected": True, "submissionId": submission_id} - ) - - -SUBMIT_REVIEW = """ -mutation submitStandardQueue( - $changes: JSONString, - $rejected: Boolean, - $submissionId: Int!, - $notes: String -) { - submitReview( - changes: $changes, - rejected: $rejected, - submissionId: $submissionId, - notes: $notes -) { - id - __typename - } -} -""" - -GET_RANDOM_EXCEPTION_ID = """ -query getExceptionsSubmission($workflowId: Int!) { - randomSubmission(adminReview: true, workflowId: $workflowId) { - id - resultFile - inputFilename - autoReview { - id - changes - __typename - } - __typename - } -} -""" - -GET_RANDOM_REVIEW_ID = """ -query getSubmission($workflowId: Int!) { - randomSubmission(adminReview: false, workflowId: $workflowId) { - id - resultFile - inputFilename - autoReview { - id - changes - __typename - } - __typename - } -} -""" diff --git a/indico_toolkit/ocr/__init__.py b/indico_toolkit/ocr/__init__.py index fbf15bf5..7d728eac 100644 --- a/indico_toolkit/ocr/__init__.py +++ b/indico_toolkit/ocr/__init__.py @@ -1,9 +1,7 @@ -from .customocr_object import CustomOcr from .ondoc_object import OnDoc from .standard_object import StandardOcr __all__ = ( - "CustomOcr", "OnDoc", "StandardOcr", ) diff --git a/indico_toolkit/ocr/customocr_object.py b/indico_toolkit/ocr/customocr_object.py deleted file mode 100644 index 78e1c79c..00000000 --- a/indico_toolkit/ocr/customocr_object.py +++ /dev/null @@ -1,42 +0,0 @@ -from typing import List, Union - - -class CustomOcr: - """ - CustomOcr is a helper class for the raw preset config OCR results. Enables easy - extraction of full text and page-level text. 
- """ - - def __init__(self, customocr: Union[List[dict], dict]): - """ - customocr Union[List[dict], dict]: result object from - indico.queries.DocumentExtraction - """ - self.customocr = customocr - - @property - def full_text(self) -> str: - """ - Return full document text as string - """ - if isinstance(self.customocr, dict) and "text" in self.customocr: - return self.customocr["text"] - elif isinstance(self.customocr, dict) and "pages" in self.customocr: - if "text" in self.customocr["pages"][0]: - return "\n".join(page["text"] for page in self.customocr["pages"]) - elif isinstance(self.customocr, list) and "pages" in self.customocr[0]: - if "text" in self.customocr[0]["pages"][0]: - return "\n".join(page["pages"][0]["text"] for page in self.customocr) - raise RuntimeError("JSON configuration setting does not have full text.") - - @property - def page_texts(self) -> List[str]: - """ - Return list of page-level text - """ - if isinstance(self.customocr, dict) and "pages" in self.customocr: - return [page["text"] for page in self.customocr["pages"]] - elif isinstance(self.customocr, list) and "pages" in self.customocr[0]: - if "text" in self.customocr[0]["pages"][0]: - return [page["pages"][0]["text"] for page in self.customocr] - raise RuntimeError("JSON configuration setting does not have page-level text.") diff --git a/mypy.ini b/mypy.ini index a205789e..fd98fdd6 100644 --- a/mypy.ini +++ b/mypy.ini @@ -18,9 +18,6 @@ ignore_missing_imports = True [mypy-indico_toolkit.association.*] ignore_errors = True -[mypy-indico_toolkit.auto_populate.*] -ignore_errors = True - [mypy-indico_toolkit.auto_review.*] ignore_errors = True diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index b0c3d356..ec91e109 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -6,14 +6,11 @@ AddModelGroupComponent, CreateDataset, CreateWorkflow, - DocumentExtraction, GetTrainingModelWithProgress, GraphQLRequest, - JobStatus, - RetrieveStorageObject, ) -from indico_toolkit.indico_wrapper import DocExtraction, Workflow +from indico_toolkit.indico_wrapper import Workflow def pytest_addoption(parser: pytest.Parser) -> None: @@ -62,11 +59,6 @@ def dataset_id(dataset): return dataset.id -@pytest.fixture(scope="session") -def doc_extraction_standard(indico_client): - return DocExtraction(indico_client) - - @pytest.fixture(scope="session") def extraction_model_group_id(workflow): return workflow.components[-1].model_group.id @@ -102,18 +94,6 @@ def module_submission_ids(workflow_id, indico_client, pdf_file): return sub_ids -@pytest.fixture(scope="session") -def ondoc_ocr_object(indico_client, pdf_file): - job = indico_client.call( - DocumentExtraction( - files=[pdf_file], json_config={"preset_config": "ondocument"} - ) - ) - job = indico_client.call(JobStatus(id=job[0].id, wait=True)) - extracted_data = indico_client.call(RetrieveStorageObject(job.result)) - return extracted_data - - @pytest.fixture(scope="session") def pdf_file(tests_folder: Path) -> Path: return tests_folder / "data/samples/fin_disc.pdf" @@ -124,17 +104,6 @@ def populator_snapshot_file(tests_folder: Path) -> Path: return tests_folder / "data/snapshots/populator_snapshot.csv" -@pytest.fixture(scope="session") -def standard_ocr_object(indico_client, pdf_file): - # TODO: this can be static-- probably should be "ondoc" as well - job = indico_client.call( - DocumentExtraction(files=[pdf_file], json_config={"preset_config": "standard"}) - ) - job = indico_client.call(JobStatus(id=job[0].id, wait=True)) - 
extracted_data = indico_client.call(RetrieveStorageObject(job.result)) - return extracted_data - - @pytest.fixture(scope="session") def teach_task_id(workflow): return workflow.components[-1].model_group.questionnaire_id diff --git a/tests/integration/indico_wrapper/test_dataset.py b/tests/integration/indico_wrapper/test_dataset.py deleted file mode 100644 index 2b785071..00000000 --- a/tests/integration/indico_wrapper/test_dataset.py +++ /dev/null @@ -1,40 +0,0 @@ -""" -Test Datasets class methods -""" - -import pytest -from indico.types import Dataset - -from indico_toolkit.indico_wrapper import Datasets - - -@pytest.fixture(scope="module") -def dataset_wrapper(indico_client): - return Datasets(indico_client) - - -def test_get_dataset(dataset_wrapper, dataset_id): - dataset = dataset_wrapper.get_dataset(dataset_id) - assert isinstance(dataset, Dataset) - - -def test_add_to_dataset(dataset_wrapper, dataset_id, pdf_file): - dataset = dataset_wrapper.add_files_to_dataset(dataset_id, filepaths=[pdf_file]) - assert isinstance(dataset, Dataset) - for f in dataset.files: - assert f.status in ["PROCESSED", "FAILED"] - - -def test_get_dataset_files(dataset_wrapper, dataset_id): - files_list = dataset_wrapper.get_dataset_metadata(dataset_id) - assert isinstance(files_list, list) - assert len(files_list) > 0 - - -def test_create_delete_dataset(dataset_wrapper, pdf_file): - dataset = dataset_wrapper.create_dataset( - filepaths=[pdf_file], dataset_name="Toolkit Integration Tests" - ) - assert isinstance(dataset, Dataset) - status = dataset_wrapper.delete_dataset(dataset.id) - assert status diff --git a/tests/integration/indico_wrapper/test_doc_extraction.py b/tests/integration/indico_wrapper/test_doc_extraction.py deleted file mode 100644 index 7917f68d..00000000 --- a/tests/integration/indico_wrapper/test_doc_extraction.py +++ /dev/null @@ -1,127 +0,0 @@ -from indico_toolkit.indico_wrapper import DocExtraction -from indico_toolkit.ocr import OnDoc, StandardOcr - - -def test_run_ocr_ondoc(indico_client, pdf_file): - doc_extraction_ondoc = DocExtraction(indico_client, preset_config="ondocument") - extracted_data = doc_extraction_ondoc.run_ocr(filepaths=[pdf_file]) - for item in extracted_data: - assert isinstance(item, OnDoc) - - -def test_run_ocr_standard(doc_extraction_standard, pdf_file): - extracted_data = doc_extraction_standard.run_ocr(filepaths=[pdf_file]) - for item in extracted_data: - assert isinstance(item, StandardOcr) - - -def test_run_ocr_standard_full_text(doc_extraction_standard, pdf_file): - full_text_result = doc_extraction_standard.run_ocr( - filepaths=[pdf_file], text_setting="full_text" - ) - assert len(full_text_result[0]) == 2062 - - -def test_run_ocr_standard_page_texts(doc_extraction_standard, pdf_file): - page_texts_result = doc_extraction_standard.run_ocr( - filepaths=[pdf_file], text_setting="page_texts" - ) - assert len(page_texts_result[0][0]) == 1153 - - -def test_run_ocr_custom_full_text(indico_client, pdf_file): - doc_extraction_custom = DocExtraction( - indico_client, - custom_config={ - "top_level": "page", - "nest": False, - "reblocking": ["style", "list", "inline-header"], - "pages": [ - "text", - "size", - "dpi", - "doc_offset", - "page_num", - "image", - "thumbnail", - ], - "blocks": [ - "text", - "doc_offset", - "page_offset", - "position", - "block_type", - "page_num", - ], - "tokens": [ - "text", - "doc_offset", - "page_offset", - "block_offset", - "position", - "page_num", - "style", - ], - "chars": [ - "text", - "doc_index", - "block_index", - "page_index", 
- "page_num", - "position", - ], - }, - ) - full_text_result = doc_extraction_custom.run_ocr( - filepaths=[pdf_file], text_setting="full_text" - ) - assert len(full_text_result[0]) == 2067 - - -def test_run_ocr_custom_page_texts(indico_client, pdf_file): - doc_extraction_custom = DocExtraction( - indico_client, - custom_config={ - "top_level": "page", - "nest": False, - "reblocking": ["style", "list", "inline-header"], - "pages": [ - "text", - "size", - "dpi", - "doc_offset", - "page_num", - "image", - "thumbnail", - ], - "blocks": [ - "text", - "doc_offset", - "page_offset", - "position", - "block_type", - "page_num", - ], - "tokens": [ - "text", - "doc_offset", - "page_offset", - "block_offset", - "position", - "page_num", - "style", - ], - "chars": [ - "text", - "doc_index", - "block_index", - "page_index", - "page_num", - "position", - ], - }, - ) - page_texts_result = doc_extraction_custom.run_ocr( - filepaths=[pdf_file], text_setting="page_texts" - ) - assert len(page_texts_result[0][0]) == 1158 diff --git a/tests/integration/indico_wrapper/test_reviewer.py b/tests/integration/indico_wrapper/test_reviewer.py deleted file mode 100644 index 9b26a4ce..00000000 --- a/tests/integration/indico_wrapper/test_reviewer.py +++ /dev/null @@ -1,60 +0,0 @@ -import pytest - -from indico_toolkit.indico_wrapper import Reviewer, Workflow - - -@pytest.fixture(scope="module") -def submissions_awaiting_review(workflow_id, indico_client, pdf_file): - """ - Ensure that auto review is turned off and there are two submissions "PENDING_REVIEW" - """ - workflow_wrapper = Workflow(indico_client) - workflow_wrapper.update_workflow_settings( - workflow_id, enable_review=True, enable_auto_review=False - ) - sub_ids = workflow_wrapper.submit_documents_to_workflow( - workflow_id, files=[pdf_file, pdf_file] - ) - workflow_wrapper.wait_for_submissions_to_process(sub_ids) - - -def get_change_formatted_predictions(workflow_result): - """ - Helper function for get change format for accepted predictions in test_accept_review - """ - return {workflow_result.model_name: workflow_result.get_predictions.to_list()} - - -@pytest.mark.skip(reason="broken on indico-client>=6.1.0") -def test_accept_review(submissions_awaiting_review, indico_client, workflow_id): - reviewer_wrapper = Reviewer(indico_client, workflow_id) - id_in_review = reviewer_wrapper.get_random_review_id() - submission = reviewer_wrapper.get_submission_object(id_in_review) - assert submission.status == "PENDING_REVIEW" - predictions = reviewer_wrapper.get_submission_results_from_ids([id_in_review]) - changes = get_change_formatted_predictions(predictions[0]) - reviewer_wrapper.accept_review(id_in_review, changes) - submission = reviewer_wrapper.get_submission_object(id_in_review) - assert submission.status == "COMPLETE" - - -@pytest.mark.skip(reason="flaky, depends on submission processing time") -def test_reject_from_review(submissions_awaiting_review, indico_client, workflow_id): - reviewer_wrapper = Reviewer(indico_client, workflow_id) - id_in_review = reviewer_wrapper.get_random_review_id() - reviewer_wrapper.reject_submission(id_in_review) - submission = reviewer_wrapper.get_submission_object(id_in_review) - assert submission.status == "PENDING_ADMIN_REVIEW" - - -@pytest.mark.skip(reason="flaky, depends on submission processing time") -def test_reject_from_admin_review( - submissions_awaiting_review, indico_client, workflow_id -): - reviewer_wrapper = Reviewer(indico_client, workflow_id) - id_in_exception = reviewer_wrapper.get_random_exception_id() - 
submission = reviewer_wrapper.get_submission_object(id_in_exception) - assert submission.status == "PENDING_ADMIN_REVIEW" - reviewer_wrapper.reject_submission(id_in_exception) - submission = reviewer_wrapper.get_submission_object(id_in_exception) - assert submission.status == "COMPLETE" diff --git a/tests/integration/ocr/__init__.py b/tests/integration/ocr/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/integration/ocr/test_customocr_object.py b/tests/integration/ocr/test_customocr_object.py deleted file mode 100644 index 078ca7f2..00000000 --- a/tests/integration/ocr/test_customocr_object.py +++ /dev/null @@ -1,47 +0,0 @@ -import pytest - -from indico_toolkit.indico_wrapper import DocExtraction - - -def test_full_text(indico_client, pdf_file): - doc_extraction = DocExtraction(indico_client, preset_config="simple") - custom_ocr = doc_extraction.run_ocr(filepaths=[pdf_file]) - assert len(custom_ocr[0].full_text) == 2823 - - -def test_full_text_exception(indico_client, pdf_file): - doc_extraction = DocExtraction( - indico_client, - custom_config={ - "nest": True, - "top_level": "document", - "native_pdf": True, - "blocks": ["text", "position", "doc_offset", "page_offset"], - }, - ) - custom_ocr = doc_extraction.run_ocr(filepaths=[pdf_file]) - with pytest.raises(Exception): - custom_ocr[0].full_text - - -def test_page_texts(indico_client, pdf_file): - doc_extraction = DocExtraction( - indico_client, - custom_config={ - "nest": True, - "top_level": "document", - "native_pdf": True, - "pages": ["text", "size", "dpi", "doc_offset", "page_num", "image"], - "blocks": ["text", "position", "doc_offset", "page_offset"], - }, - ) - custom_ocr = doc_extraction.run_ocr(filepaths=[pdf_file]) - assert isinstance(custom_ocr[0].page_texts, list) - assert isinstance(custom_ocr[0].page_texts[0], str) - - -def test_page_texts_exception(indico_client, pdf_file): - doc_extraction = DocExtraction(indico_client, preset_config="legacy") - custom_ocr = doc_extraction.run_ocr(filepaths=[pdf_file]) - with pytest.raises(Exception): - custom_ocr.page_texts diff --git a/tests/integration/ocr/test_ondoc_object.py b/tests/integration/ocr/test_ondoc_object.py deleted file mode 100644 index e5f49c6b..00000000 --- a/tests/integration/ocr/test_ondoc_object.py +++ /dev/null @@ -1,63 +0,0 @@ -import pytest - -from indico_toolkit.ocr import OnDoc - - -def test_ondoc_full_text(ondoc_ocr_object): - ondoc_ocr = OnDoc(ondoc_ocr_object) - assert len(ondoc_ocr.full_text) == 2067 - - -def test_ondoc_page_texts(ondoc_ocr_object): - ondoc_ocr = OnDoc(ondoc_ocr_object) - assert len(ondoc_ocr.page_texts) == 2 - assert len(ondoc_ocr.page_texts[0]) == 1158 - - -def test_ondoc_page_results(ondoc_ocr_object): - ondoc_ocr = OnDoc(ondoc_ocr_object) - assert len(ondoc_ocr.page_results) == 2 - assert len(ondoc_ocr.page_results[0]) == 8 - - -def test_ondoc_block_texts(ondoc_ocr_object): - ondoc_ocr = OnDoc(ondoc_ocr_object) - assert len(ondoc_ocr.block_texts) == 41 - - -def test_ondoc_token_objects(ondoc_ocr_object): - ondoc_ocr = OnDoc(ondoc_ocr_object) - assert len(ondoc_ocr.token_objects) == 304 - - -def test_ondoc_total_pages(ondoc_ocr_object): - ondoc_ocr = OnDoc(ondoc_ocr_object) - assert ondoc_ocr.total_pages == 2 - - -def test_ondoc_total_characters(ondoc_ocr_object): - ondoc_ocr = OnDoc(ondoc_ocr_object) - assert ondoc_ocr.total_characters == 2067 - - -def test_ondoc_total_tokens(ondoc_ocr_object): - ondoc_ocr = OnDoc(ondoc_ocr_object) - assert ondoc_ocr.total_tokens == 304 - - -def 
test_ondoc_confidence(ondoc_ocr_object): - ondoc_ocr = OnDoc(ondoc_ocr_object) - assert isinstance(ondoc_ocr.ocr_confidence("mean"), float) - assert 1 <= ondoc_ocr.ocr_confidence("mean") <= 100 - - -def test_ondoc_confidence_metric_exception(ondoc_ocr_object): - ondoc_ocr = OnDoc(ondoc_ocr_object) - with pytest.raises(Exception): - ondoc_ocr.ocr_confidence("average") - - -def test_ondoc_excluded_confidence_exception(ondoc_ocr_object): - ondoc_ocr = OnDoc(ondoc_ocr_object[0]["chars"][0].pop("confidence")) - with pytest.raises(Exception): - ondoc_ocr.ocr_confidence("mean") diff --git a/tests/integration/ocr/test_standard_object.py b/tests/integration/ocr/test_standard_object.py deleted file mode 100644 index 4753b8da..00000000 --- a/tests/integration/ocr/test_standard_object.py +++ /dev/null @@ -1,33 +0,0 @@ -from indico_toolkit.ocr import StandardOcr - - -def test_standard_object_full_text(standard_ocr_object): - standard_ocr = StandardOcr(standard_ocr_object) - assert len(standard_ocr.full_text) == 2062 - - -def test_standard_object_page_texts(standard_ocr_object): - standard_ocr = StandardOcr(standard_ocr_object) - assert len(standard_ocr.page_texts) == 2 - assert len(standard_ocr.page_texts[0]) == 1153 - - -def test_standard_object_page_results(standard_ocr_object): - standard_ocr = StandardOcr(standard_ocr_object) - assert len(standard_ocr.page_results) == 2 - assert len(standard_ocr.page_results[0]) == 4 - - -def test_standard_object_block_texts(standard_ocr_object): - standard_ocr = StandardOcr(standard_ocr_object) - assert len(standard_ocr.block_texts) == 36 - - -def test_standard_object_total_pages(standard_ocr_object): - standard_ocr = StandardOcr(standard_ocr_object) - assert standard_ocr.total_pages == 2 - - -def test_standard_object_total_characters(standard_ocr_object): - standard_ocr = StandardOcr(standard_ocr_object) - assert standard_ocr.total_characters == 2062 diff --git a/tests/integration/test_populator.py b/tests/integration/test_populator.py deleted file mode 100644 index bd2981fd..00000000 --- a/tests/integration/test_populator.py +++ /dev/null @@ -1,80 +0,0 @@ -import json -import os - -import pytest -from indico.queries import GetWorkflow -from indico.types import Workflow - -from indico_toolkit.auto_populate import AutoPopulator -from indico_toolkit.auto_populate.types import LabelInput, LabelInst - -pd = pytest.importorskip("pandas") - - -@pytest.fixture(scope="function") -def static_file_to_targets(populator_snapshot_file): - df = pd.read_csv(populator_snapshot_file) - file_to_targets = {} - for file, target in zip( - df["file_name_1820"].to_list(), df["Toolkit Test Financial Model"].to_list() - ): - if not isinstance(target, float): - file_to_targets[file] = json.loads(target)["targets"] - return file_to_targets - - -def test_create_classification_workflow(indico_client, tests_folder): - auto_populator = AutoPopulator(indico_client) - new_workflow = auto_populator.create_auto_classification_workflow( - os.path.join(tests_folder, "data/auto_class"), - "My dataset", - "My workflow", - "My teach task", - ) - assert isinstance(new_workflow, Workflow) - - -def test_create_classification_workflow_too_few_classes(indico_client, tests_folder): - auto_populator = AutoPopulator(indico_client) - with pytest.raises(Exception): - auto_populator.create_auto_classification_workflow( - os.path.join(tests_folder, "data/auto_class/class_a/"), - "My dataset", - "My workflow", - "My teach task", - ) - - -def test_copy_teach_task(indico_client, dataset, workflow_id, teach_task_id): 
- auto_populator = AutoPopulator(indico_client) - original_workflow = indico_client.call(GetWorkflow(workflow_id)) - new_workflow = auto_populator.copy_teach_task( - dataset_id=dataset.id, - teach_task_id=teach_task_id, - workflow_name=f"{original_workflow.name}_Copied", - data_column="text", - ) - assert isinstance(new_workflow, Workflow) - - -def test_get_labels_by_filename( - indico_client, - extraction_model_group_id, - teach_task_id, - static_file_to_targets, -): - populator = AutoPopulator(indico_client) - ( - labelset_id, - model_group_id, - target_name_map, - ) = populator._get_teach_task_details(teach_task_id) - - labels = populator.get_labels_by_filename( - extraction_model_group_id, static_file_to_targets, target_name_map - ) - assert len(labels) != 0 - for label in labels: - assert isinstance(label, LabelInput) - for target in label.targets: - assert isinstance(target, LabelInst) From c283802eac959b53f57bb144f675d1a6dd34d2f1 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Tue, 1 Apr 2025 08:33:41 -0500 Subject: [PATCH 02/14] Copy mypy config from pyproject.toml since mypy won't merge them --- mypy.ini | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mypy.ini b/mypy.ini index a205789e..582db9bb 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,7 +1,7 @@ -[mypy] # -# pyproject.toml contains the main configuration for mypy, which applies to existing -# modules that are type hinted and any new modules that are added. +# pyproject.toml contains the main configuration for mypy, which is copied here as mypy +# will not merge the configs. This config applies to existing modules that are type +# hinted and any new modules that are added. # # This file contains overrides to ignore errors in older modules and dependencies that # aren't type hinted, with the intention that these errors are fixed over time so that @@ -11,6 +11,11 @@ # address the errors it finds, repeating until the module passes. Once it passes, # remove the override from the list and commit the changes. 
# +[mypy] +strict = true +show_error_codes = true +warn_unreachable = true +disallow_any_unimported = true [mypy-indico.*] ignore_missing_imports = True From 6acf68c892b749516d53d2be6ffa18795f68688a Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Tue, 1 Apr 2025 08:34:28 -0500 Subject: [PATCH 03/14] Unignore type hinted modules --- mypy.ini | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mypy.ini b/mypy.ini index 582db9bb..8ec91288 100644 --- a/mypy.ini +++ b/mypy.ini @@ -29,9 +29,6 @@ ignore_errors = True [mypy-indico_toolkit.auto_review.*] ignore_errors = True -[mypy-indico_toolkit.etloutput.*] -ignore_errors = True - [mypy-indico_toolkit.indico_wrapper.*] ignore_errors = True @@ -44,12 +41,6 @@ ignore_errors = True [mypy-indico_toolkit.pipelines.*] ignore_errors = True -[mypy-indico_toolkit.polling.*] -ignore_errors = True - -[mypy-indico_toolkit.results.*] -ignore_errors = True - [mypy-indico_toolkit.snapshots.*] ignore_errors = True From 8536eab4b9fad2821028e2fa6c35b81e4ebabeb8 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Tue, 1 Apr 2025 08:37:12 -0500 Subject: [PATCH 04/14] Fix type ignores related to indico-client --- indico_toolkit/client.py | 2 +- indico_toolkit/polling/autoreview.py | 6 +++--- indico_toolkit/polling/downstream.py | 8 ++++---- indico_toolkit/polling/queries.py | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/indico_toolkit/client.py b/indico_toolkit/client.py index 1a4c4cd7..49f69468 100644 --- a/indico_toolkit/client.py +++ b/indico_toolkit/client.py @@ -8,7 +8,7 @@ @retry(IndicoRequestError, ConnectionError) -def create_client( +def create_client( # type: ignore[no-any-unimported] host: str, api_token_path: "str | None" = None, api_token_string: "str | None" = None, diff --git a/indico_toolkit/polling/autoreview.py b/indico_toolkit/polling/autoreview.py index 401a3128..460d077b 100644 --- a/indico_toolkit/polling/autoreview.py +++ b/indico_toolkit/polling/autoreview.py @@ -3,8 +3,8 @@ from dataclasses import dataclass from typing import TYPE_CHECKING -from indico import AsyncIndicoClient, IndicoConfig # type: ignore[import-untyped] -from indico.queries import ( # type: ignore[import-untyped] +from indico import AsyncIndicoClient, IndicoConfig +from indico.queries import ( GetSubmission, JobStatus, RetrieveStorageObject, @@ -41,7 +41,7 @@ class AutoReviewPoller: and submits the review results concurrently. """ - def __init__( + def __init__( # type: ignore[no-any-unimported] self, config: IndicoConfig, workflow_id: int, diff --git a/indico_toolkit/polling/downstream.py b/indico_toolkit/polling/downstream.py index b22b7f15..88506f98 100644 --- a/indico_toolkit/polling/downstream.py +++ b/indico_toolkit/polling/downstream.py @@ -2,12 +2,12 @@ import logging from typing import TYPE_CHECKING -from indico import AsyncIndicoClient, IndicoConfig # type: ignore[import-untyped] -from indico.queries import ( # type: ignore[import-untyped] +from indico import AsyncIndicoClient, IndicoConfig +from indico.queries import ( GetSubmission, UpdateSubmission, ) -from indico.types import Submission # type: ignore[import-untyped] +from indico.types import Submission from ..retry import retry from .queries import SubmissionIdsPendingDownstream @@ -29,7 +29,7 @@ class DownstreamPoller: them concurrently, and marks them as retrieved. 
""" - def __init__( + def __init__( # type: ignore[no-any-unimported] self, config: IndicoConfig, workflow_id: int, diff --git a/indico_toolkit/polling/queries.py b/indico_toolkit/polling/queries.py index bd629f5c..e67c257e 100644 --- a/indico_toolkit/polling/queries.py +++ b/indico_toolkit/polling/queries.py @@ -1,12 +1,12 @@ from typing import TYPE_CHECKING -from indico.queries import GraphQLRequest # type: ignore[import-untyped] +from indico.queries import GraphQLRequest if TYPE_CHECKING: from typing import Any -class SubmissionIdsPendingAutoReview(GraphQLRequest): # type: ignore[misc] +class SubmissionIdsPendingAutoReview(GraphQLRequest): # type: ignore[misc, no-any-unimported] QUERY = """ query SubmissionIdsPendingAutoReview($workflowIds: [Int]) { submissions( @@ -33,7 +33,7 @@ def process_response(self, response: "Any") -> set[int]: } -class SubmissionIdsPendingDownstream(GraphQLRequest): # type: ignore[misc] +class SubmissionIdsPendingDownstream(GraphQLRequest): # type: ignore[misc, no-any-unimported] QUERY = """ query SubmissionIdsPendingDownstream($workflowIds: [Int]) { submissions( From 2e771b059e638b43abd47699eb565db3f0d9f494 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Tue, 1 Apr 2025 08:38:23 -0500 Subject: [PATCH 05/14] Fix result file test asserts that mypy thinks are unreachable --- tests/results/test_predictions.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/results/test_predictions.py b/tests/results/test_predictions.py index 114329d1..b3b43910 100644 --- a/tests/results/test_predictions.py +++ b/tests/results/test_predictions.py @@ -16,7 +16,7 @@ def test_confidence() -> None: assert prediction.confidence == 1.0 -def test_extractions() -> None: +def test_accepted() -> None: prediction = Extraction( document=None, # type: ignore[arg-type] model=None, # type: ignore[arg-type] @@ -35,6 +35,20 @@ def test_extractions() -> None: prediction.unaccept() assert not prediction.accepted + +def test_rejected() -> None: + prediction = Extraction( + document=None, # type: ignore[arg-type] + model=None, # type: ignore[arg-type] + review=None, + label="Label", + confidences={"Label": 0.5}, + extras=None, # type: ignore[arg-type] + text="Value", + accepted=False, + rejected=False, + ) + prediction.accept() prediction.reject() assert prediction.rejected From 0c726eb11af615ee2b04892185cccc3020f6f784 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 25 Apr 2025 11:09:24 -0500 Subject: [PATCH 06/14] Parse static model results from `component_results` using IPA 7.2 `component_metadata` --- indico_toolkit/results/document.py | 18 ++++--- indico_toolkit/results/predictionlist.py | 29 ++++++++--- indico_toolkit/results/result.py | 65 +++++++++++++++++++++--- 3 files changed, 91 insertions(+), 21 deletions(-) diff --git a/indico_toolkit/results/document.py b/indico_toolkit/results/document.py index c620a218..2e695978 100644 --- a/indico_toolkit/results/document.py +++ b/indico_toolkit/results/document.py @@ -12,13 +12,14 @@ class Document: error: str traceback: str - # Auto review changes must reproduce all model sections that were present in the - # original result file. This may not be possible from the predictions alone--if a - # model had an empty section because it didn't produce predictions or if all of - # the predictions were removed to reject them. As such, the models seen when - # parsing result files are tracked per-document so that the empty sections can be - # reproduced later. 
+ # Auto review changes must reproduce all model and component sections that were + # present in the original result file. This may not be possible from the + # predictions alone--if a model or component had an empty section because it didn't + # produce predictions or if all of the predictions for that section were dropped. + # As such, the models and components seen when parsing a result file are tracked + # per-document so that the empty sections can be reproduced later. _model_sections: "frozenset[str]" + _component_sections: "frozenset[str]" @staticmethod def from_v1_dict(result: object) -> "Document": @@ -38,6 +39,7 @@ def from_v1_dict(result: object) -> "Document": error="", traceback="", _model_sections=model_names, + _component_sections=frozenset(), ) @staticmethod @@ -46,7 +48,9 @@ def from_v3_dict(document: object) -> "Document": Create a `Document` from a v3 document dictionary. """ model_results = get(document, dict, "model_results", "ORIGINAL") + component_results = get(document, dict, "component_results", "ORIGINAL") model_ids = frozenset(model_results.keys()) + component_ids = frozenset(component_results.keys()) etl_output_uri = get(document, str, "etl_output") return Document( @@ -57,6 +61,7 @@ def from_v3_dict(document: object) -> "Document": error="", traceback="", _model_sections=model_ids, + _component_sections=component_ids, ) @staticmethod @@ -75,4 +80,5 @@ def from_v3_errored_file(errored_file: object) -> "Document": error=error, traceback=traceback, _model_sections=frozenset(), + _component_sections=frozenset(), ) diff --git a/indico_toolkit/results/predictionlist.py b/indico_toolkit/results/predictionlist.py index 755c0c7a..cbea64f8 100644 --- a/indico_toolkit/results/predictionlist.py +++ b/indico_toolkit/results/predictionlist.py @@ -370,13 +370,8 @@ def to_v3_changes(self, documents: "Iterable[Document]") -> "list[dict[str, Any] continue model_results: "dict[str, Any]" = {} - changes.append( - { - "submissionfile_id": document.id, - "model_results": model_results, - "component_results": {}, - } - ) + component_results: "dict[str, Any]" = {} + predictions_by_model = self.where( document=document, ).groupby( @@ -384,12 +379,30 @@ def to_v3_changes(self, documents: "Iterable[Document]") -> "list[dict[str, Any] ) for model, predictions in predictions_by_model.items(): - model_results[str(model.id)] = [ + model_id = str(model.id) + prediction_dicts = [ prediction.to_v3_dict() for prediction in predictions ] + if model_id in document._model_sections: + model_results[model_id] = prediction_dicts + elif model_id in document._component_sections: + component_results[model_id] = prediction_dicts + for model_id in document._model_sections: if model_id not in model_results: model_results[model_id] = [] + for component_id in document._component_sections: + if component_id not in component_results: + component_results[component_id] = [] + + changes.append( + { + "submissionfile_id": document.id, + "model_results": model_results, + "component_results": component_results, + } + ) + return changes diff --git a/indico_toolkit/results/result.py b/indico_toolkit/results/result.py index 4a6c79d3..ceefb5cd 100644 --- a/indico_toolkit/results/result.py +++ b/indico_toolkit/results/result.py @@ -5,12 +5,13 @@ from . 
import predictions as prediction from .document import Document +from .errors import ResultError from .model import ModelGroup from .normalization import normalize_v1_result, normalize_v3_result from .predictionlist import PredictionList from .predictions import Prediction from .review import Review, ReviewType -from .utils import get +from .utils import get, has if TYPE_CHECKING: from typing import Any @@ -106,16 +107,33 @@ def from_v3_dict(result: object) -> "Result": submission_id = get(result, int, "submission_id") submission_results = get(result, list, "submission_results") modelgroup_metadata = get(result, dict, "modelgroup_metadata") + component_metadata = get(result, dict, "component_metadata") review_metadata = get(result, dict, "reviews") - - processed_documents = map(Document.from_v3_dict, submission_results) errored_files = get(result, dict, "errored_files").values() - failed_documents = map(Document.from_v3_errored_file, errored_files) - documents = sorted(chain(processed_documents, failed_documents)) - models = sorted(map(ModelGroup.from_v3_dict, modelgroup_metadata.values())) - predictions: "PredictionList[Prediction]" = PredictionList() + + static_model_components = filter( + lambda component: ( + get(component, str, "component_type").casefold() == "static_model" + ), + component_metadata.values(), + ) + + documents = sorted( + chain( + map(Document.from_v3_dict, submission_results), + map(Document.from_v3_errored_file, errored_files), + ) + ) + models = sorted( + chain( + map(ModelGroup.from_v3_dict, modelgroup_metadata.values()), + map(ModelGroup.from_v3_dict, static_model_components), + ) + ) reviews = sorted(map(Review.from_dict, review_metadata.values())) + predictions: "PredictionList[Prediction]" = PredictionList() + for document_dict in submission_results: document_id = get(document_dict, int, "submissionfile_id") document = next( @@ -124,11 +142,17 @@ def from_v3_dict(result: object) -> "Result": reviewed_model_predictions: "list[tuple[Review | None, Any]]" = [ (None, get(document_dict, dict, "model_results", "ORIGINAL")) ] + reviewed_component_predictions: "list[tuple[Review | None, Any]]" = [ + (None, get(document_dict, dict, "component_results", "ORIGINAL")) + ] if reviews: reviewed_model_predictions.append( (reviews[-1], get(document_dict, dict, "model_results", "FINAL")) ) + reviewed_component_predictions.append( + (reviews[-1], get(document_dict, dict, "component_results", "FINAL")) # fmt: skip # noqa: E501 + ) for review, model_section in reviewed_model_predictions: for model_id, model_predictions in model_section.items(): @@ -142,6 +166,33 @@ def from_v3_dict(result: object) -> "Result": ) ) + for review, component_section in reviewed_component_predictions: + for component_id, component_predictions in component_section.items(): + try: + model = next( + filter(lambda model: model.id == int(component_id), models) + ) + except StopIteration: + if has(component_metadata, str, component_id, "component_type"): + component_type = get( + component_metadata, str, component_id, "component_type" + ) + raise ResultError( + f"unsupported component type `{component_type!r}` " + f"for component {component_id}" + ) + else: + raise ResultError( + f"no component metadata for component {component_id}" + ) + + predictions.extend( + map( + partial(prediction.from_v3_dict, document, model, review), + component_predictions, + ) + ) + return Result( version=version, submission_id=submission_id, From 6a2d06cc8f0e380913a1043b043ffe66f3c5ced9 Mon Sep 17 00:00:00 2001 From: Michael 
Welborn Date: Fri, 25 Apr 2025 11:34:12 -0500 Subject: [PATCH 07/14] Include component results in normalization and unit tests --- indico_toolkit/results/normalization.py | 17 +- .../data/results/97211_v3_static_models.json | 1215 +++++++++++++++++ tests/results/test_document.py | 17 +- tests/results/test_predictionlist.py | 1 + 4 files changed, 1246 insertions(+), 4 deletions(-) create mode 100644 tests/data/results/97211_v3_static_models.json diff --git a/indico_toolkit/results/normalization.py b/indico_toolkit/results/normalization.py index 7924f347..b96dff0d 100644 --- a/indico_toolkit/results/normalization.py +++ b/indico_toolkit/results/normalization.py @@ -1,4 +1,5 @@ import re +from itertools import chain from typing import TYPE_CHECKING from .utils import get, has @@ -88,12 +89,18 @@ def normalize_v3_result(result: "Any") -> None: """ task_type_by_model_group_id = { model_group_id: model_group["task_type"] - for model_group_id, model_group in result["modelgroup_metadata"].items() + for model_group_id, model_group in chain( + result["modelgroup_metadata"].items(), + result.get("component_metadata", {}).items(), + ) } predictions_with_task_type: "Iterator[tuple[Any, str]]" = ( - (prediction, task_type_by_model_group_id[model_group_id]) + (prediction, task_type_by_model_group_id.get(model_group_id, "")) for submission_result in get(result, list, "submission_results") - for review_result in get(submission_result, dict, "model_results").values() + for review_result in chain( + get(submission_result, dict, "model_results").values(), + get(submission_result, dict, "component_results").values(), + ) for model_group_id, model_results in review_result.items() for prediction in model_results ) @@ -139,6 +146,10 @@ def normalize_v3_result(result: "Any") -> None: if task_type == "summarization" and "citations" not in prediction: prediction["citations"] = [] + # Prior to 7.2, v3 result files don't include a `component_metadata` section. + if not has(result, dict, "component_metadata"): + result["component_metadata"] = {} + # Prior to 6.8, v3 result files don't include a `reviews` section. 
if not has(result, dict, "reviews"): result["reviews"] = {} diff --git a/tests/data/results/97211_v3_static_models.json b/tests/data/results/97211_v3_static_models.json new file mode 100644 index 00000000..07c6782e --- /dev/null +++ b/tests/data/results/97211_v3_static_models.json @@ -0,0 +1,1215 @@ +{ + "file_version": 3, + "submission_id": 97211, + "modelgroup_metadata": {}, + "component_metadata": { + "19407": { + "id": 19407, + "name": null, + "component_type": "input_ocr_extraction", + "task_type": null + }, + "19408": { + "id": 19408, + "name": null, + "component_type": "output_json_formatter", + "task_type": null + }, + "19409": { + "id": 19409, + "name": "Accounting Classification", + "component_type": "static_model", + "task_type": "classification" + }, + "19410": { + "id": 19410, + "name": "Agent Link", + "component_type": "link_classification_model", + "task_type": null + }, + "19411": { + "id": 19411, + "name": "Invoice Extraction", + "component_type": "static_model", + "task_type": "annotation" + }, + "19412": { + "id": 19412, + "name": "Purchase Order Extraction", + "component_type": "static_model", + "task_type": "annotation" + }, + "19413": { + "id": 19413, + "name": "Invoice Line Items", + "component_type": "link_label", + "task_type": null + }, + "19414": { + "id": 19414, + "name": "Purchase Order Line Items", + "component_type": "link_label", + "task_type": null + }, + "19415": { + "id": 19415, + "name": "Review", + "component_type": "review", + "task_type": null + }, + "19416": { + "id": 19416, + "name": "Standard Output", + "component_type": "default_output", + "task_type": null + } + }, + "submission_results": [ + { + "submissionfile_id": 93479, + "etl_output": "indico-file:///storage/submission/5588/97211/93479/etl_output.json", + "input_filename": "invoice.pdf", + "input_filepath": "indico-file:///storage/submission/5588/97211/93479.pdf", + "input_filesize": 426157, + "model_results": { "ORIGINAL": {}, "FINAL": {} }, + "component_results": { + "ORIGINAL": { + "19409": [ + { + "field_id": 858117, + "confidence": { + "Invoice": 0.9999999853918985, + "Purchase Order": 1.4608101511772668e-8 + }, + "label": "Invoice" + } + ], + "19411": [ + { + "label": "Vendor Name", + "spans": [{ "start": 0, "end": 13, "page_num": 0 }], + "span_id": "93479:c:19411:idx:0", + "confidence": { + "Invoice Date": 3.3424697676309734e-8, + "Invoice Number": 3.447171437187535e-8, + "Invoice Subtotal": 2.993116865468437e-8, + "Invoice Tax": 3.6883669451981405e-8, + "Invoice Total": 2.7991509554681215e-8, + "Line Item Name": 8.883939806025865e-9, + "Line Item Quantity": 5.827023485949212e-8, + "Line Item Total": 5.176908146609094e-8, + "Vendor Name": 0.9999996423721313 + }, + "field_id": 858126, + "location_type": "exact", + "text": "HubSpot, Inc.", + "groupings": [], + "normalized": { + "text": "HubSpot, Inc.", + "start": 0, + "end": 13, + "structured": null, + "formatted": "HubSpot, Inc.", + "status": "SUCCESS", + "comparison_type": "string", + "comparison_value": "HubSpot, Inc.", + "validation": [ + { + "validation_type": "TYPE_CONVERSION", + "error_message": null, + "validation_status": "SUCCESS" + } + ] + } + }, + { + "label": "Invoice Date", + "spans": [ + { "start": 125, "end": 135, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:1", + "confidence": { + "Invoice Date": 0.9999996423721313, + "Invoice Number": 3.765953948686729e-8, + "Invoice Subtotal": 2.3938278914670263e-8, + "Invoice Tax": 5.890121812512916e-8, + "Invoice Total": 2.9429369163835872e-8, + "Line Item Name": 
1.0651284299001418e-7, + "Line Item Quantity": 1.2222901091263338e-7, + "Line Item Total": 4.1870002576160914e-8, + "Vendor Name": 1.4272615089794272e-8 + }, + "field_id": 858119, + "location_type": "exact", + "text": "06/21/2016", + "groupings": [], + "normalized": { + "text": "06/21/2016", + "start": 125, + "end": 135, + "structured": null, + "formatted": "06/21/2016", + "status": "SUCCESS", + "comparison_type": "string", + "comparison_value": "06/21/2016", + "validation": [ + { + "validation_type": "TYPE_CONVERSION", + "error_message": null, + "validation_status": "SUCCESS" + } + ] + } + }, + { + "label": "Invoice Number", + "spans": [ + { "start": 146, "end": 153, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:2", + "confidence": { + "Invoice Date": 1.8282889868714847e-8, + "Invoice Number": 0.9999997019767761, + "Invoice Subtotal": 5.3069218353130054e-8, + "Invoice Tax": 3.253961722293752e-8, + "Invoice Total": 1.3829179579261108e-7, + "Line Item Name": 4.505617923200589e-8, + "Line Item Quantity": 2.9700066406235237e-8, + "Line Item Total": 2.1834006602716727e-8, + "Vendor Name": 5.3663740118281567e-8 + }, + "field_id": 858120, + "location_type": "exact", + "text": "3927578", + "groupings": [], + "normalized": { + "text": "3927578", + "start": 146, + "end": 153, + "structured": null, + "formatted": "3927578", + "status": "SUCCESS", + "comparison_type": "string", + "comparison_value": "3927578", + "validation": [ + { + "validation_type": "TYPE_CONVERSION", + "error_message": null, + "validation_status": "SUCCESS" + } + ] + } + }, + { + "label": "Line Item Name", + "spans": [ + { "start": 340, "end": 407, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:3", + "confidence": { + "Invoice Date": 7.043927041650022e-8, + "Invoice Number": 1.85422432963378e-8, + "Invoice Subtotal": 9.213624529991193e-9, + "Invoice Tax": 8.453332611679798e-8, + "Invoice Total": 1.9014857244314953e-8, + "Line Item Name": 0.9999997019767761, + "Line Item Quantity": 8.187748257171279e-9, + "Line Item Total": 6.951040631975047e-9, + "Vendor Name": 7.4245527059702e-8 + }, + "field_id": 858121, + "location_type": "exact", + "text": "HubSpot Enterprise\nIncluded Contacts\nEnterprise Contacts - Per 1000", + "groupings": [ + { + "group_name": "Invoice Line Item", + "group_index": 4, + "group_id": "19411:Invoice Line Item" + } + ], + "normalized": { + "text": "HubSpot Enterprise\nIncluded Contacts\nEnterprise Contacts - Per 1000", + "start": 340, + "end": 407, + "structured": null, + "formatted": "HubSpot Enterprise\nIncluded Contacts\nEnterprise Contacts - Per 1000", + "status": "SUCCESS", + "comparison_type": "string", + "comparison_value": "HubSpot Enterprise\nIncluded Contacts\nEnterprise Contacts - Per 1000", + "validation": [ + { + "validation_type": "TYPE_CONVERSION", + "error_message": null, + "validation_status": "SUCCESS" + } + ] + } + }, + { + "label": "Line Item Quantity", + "spans": [ + { "start": 476, "end": 477, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:4", + "confidence": { + "Invoice Date": 9.375153098289957e-8, + "Invoice Number": 5.951939741066781e-8, + "Invoice Subtotal": 6.869062474379461e-8, + "Invoice Tax": 1.196322472196698e-7, + "Invoice Total": 7.469050444797176e-8, + "Line Item Name": 6.045668499154999e-8, + "Line Item Quantity": 0.9999992847442627, + "Line Item Total": 1.0597534583212109e-7, + "Vendor Name": 1.2245095604157541e-8 + }, + "field_id": 858122, + "location_type": "exact", + "text": "1", + "groupings": [ + { + "group_name": "Invoice Line Item", + 
"group_index": 4, + "group_id": "19411:Invoice Line Item" + } + ], + "normalized": { + "text": "1", + "start": 476, + "end": 477, + "structured": null, + "formatted": "1", + "status": "SUCCESS", + "comparison_type": "string", + "comparison_value": "1", + "validation": [ + { + "validation_type": "TYPE_CONVERSION", + "error_message": null, + "validation_status": "SUCCESS" + } + ] + } + }, + { + "label": "Line Item Total", + "spans": [ + { "start": 478, "end": 487, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:5", + "confidence": { + "Invoice Date": 2.9472216667159046e-8, + "Invoice Number": 6.045723477399179e-9, + "Invoice Subtotal": 3.819852167907811e-8, + "Invoice Tax": 6.359238025055447e-9, + "Invoice Total": 1.601269694617713e-8, + "Line Item Name": 4.507643325268873e-8, + "Line Item Quantity": 9.02977035366348e-8, + "Line Item Total": 0.9999998211860657, + "Vendor Name": 3.079120602933472e-8 + }, + "field_id": 858123, + "location_type": "exact", + "text": "$1,200.00", + "groupings": [ + { + "group_name": "Invoice Line Item", + "group_index": 4, + "group_id": "19411:Invoice Line Item" + } + ], + "normalized": { + "text": "$1,200.00", + "start": 478, + "end": 487, + "structured": null, + "formatted": "$1,200.00", + "status": "SUCCESS", + "comparison_type": "string", + "comparison_value": "$1,200.00", + "validation": [ + { + "validation_type": "TYPE_CONVERSION", + "error_message": null, + "validation_status": "SUCCESS" + } + ] + } + }, + { + "label": "Line Item Quantity", + "spans": [ + { "start": 499, "end": 501, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:6", + "confidence": { + "Invoice Date": 1.4731193687111954e-7, + "Invoice Number": 4.397265485067692e-8, + "Invoice Subtotal": 8.046448840559606e-8, + "Invoice Tax": 1.2959037576365517e-7, + "Invoice Total": 5.365328092921118e-8, + "Line Item Name": 4.536295961088399e-8, + "Line Item Quantity": 0.9999994039535522, + "Line Item Total": 6.116934514466266e-8, + "Vendor Name": 1.0360611923942997e-8 + }, + "field_id": 858122, + "location_type": "exact", + "text": "10", + "groupings": [ + { + "group_name": "Invoice Line Item", + "group_index": 5, + "group_id": "19411:Invoice Line Item" + } + ], + "normalized": { + "text": "10", + "start": 499, + "end": 501, + "structured": null, + "formatted": "10", + "status": "SUCCESS", + "comparison_type": "string", + "comparison_value": "10", + "validation": [ + { + "validation_type": "TYPE_CONVERSION", + "error_message": null, + "validation_status": "SUCCESS" + } + ] + } + }, + { + "label": "Line Item Total", + "spans": [ + { "start": 502, "end": 507, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:7", + "confidence": { + "Invoice Date": 2.487784023230688e-8, + "Invoice Number": 5.1960333813383386e-9, + "Invoice Subtotal": 2.7321593876195038e-8, + "Invoice Tax": 5.000234892804656e-9, + "Invoice Total": 1.1822152146123699e-8, + "Line Item Name": 3.007375326546935e-8, + "Line Item Quantity": 8.39153528886527e-8, + "Line Item Total": 0.9999997615814209, + "Vendor Name": 1.9458024524965367e-8 + }, + "field_id": 858123, + "location_type": "exact", + "text": "$0.00", + "groupings": [ + { + "group_name": "Invoice Line Item", + "group_index": 5, + "group_id": "19411:Invoice Line Item" + } + ], + "normalized": { + "text": "$0.00", + "start": 502, + "end": 507, + "structured": null, + "formatted": "$0.00", + "status": "SUCCESS", + "comparison_type": "string", + "comparison_value": "$0.00", + "validation": [ + { + "validation_type": "TYPE_CONVERSION", + "error_message": null, + 
"validation_status": "SUCCESS" + } + ] + } + }, + { + "label": "Line Item Quantity", + "spans": [ + { "start": 519, "end": 520, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:8", + "confidence": { + "Invoice Date": 1.3724279313009902e-7, + "Invoice Number": 4.43608350053637e-8, + "Invoice Subtotal": 9.689298252624212e-8, + "Invoice Tax": 1.3488433125985466e-7, + "Invoice Total": 5.79847068138406e-8, + "Line Item Name": 4.117256224844823e-8, + "Line Item Quantity": 0.9999994039535522, + "Line Item Total": 8.499782211401907e-8, + "Vendor Name": 9.074271112297083e-9 + }, + "field_id": 858122, + "location_type": "exact", + "text": "5", + "groupings": [ + { + "group_name": "Invoice Line Item", + "group_index": 6, + "group_id": "19411:Invoice Line Item" + } + ], + "normalized": { + "text": "5", + "start": 519, + "end": 520, + "structured": null, + "formatted": "5", + "status": "SUCCESS", + "comparison_type": "string", + "comparison_value": "5", + "validation": [ + { + "validation_type": "TYPE_CONVERSION", + "error_message": null, + "validation_status": "SUCCESS" + } + ] + } + }, + { + "label": "Line Item Total", + "spans": [ + { "start": 521, "end": 527, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:9", + "confidence": { + "Invoice Date": 3.019848193730468e-8, + "Invoice Number": 5.982286666039727e-9, + "Invoice Subtotal": 4.329503866529194e-8, + "Invoice Tax": 5.606381581202413e-9, + "Invoice Total": 1.3499708018116507e-8, + "Line Item Name": 3.2691968243625524e-8, + "Line Item Quantity": 8.082533042852447e-8, + "Line Item Total": 0.9999997615814209, + "Vendor Name": 2.1103359060248295e-8 + }, + "field_id": 858123, + "location_type": "exact", + "text": "$25.00", + "groupings": [ + { + "group_name": "Invoice Line Item", + "group_index": 6, + "group_id": "19411:Invoice Line Item" + } + ], + "normalized": { + "text": "$25.00", + "start": 521, + "end": 527, + "structured": null, + "formatted": "$25.00", + "status": "SUCCESS", + "comparison_type": "string", + "comparison_value": "$25.00", + "validation": [ + { + "validation_type": "TYPE_CONVERSION", + "error_message": null, + "validation_status": "SUCCESS" + } + ] + } + }, + { + "label": "Invoice Subtotal", + "spans": [ + { "start": 537, "end": 546, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:10", + "confidence": { + "Invoice Date": 9.50030809576674e-9, + "Invoice Number": 3.281632743323826e-8, + "Invoice Subtotal": 1.0, + "Invoice Tax": 3.089213507223576e-8, + "Invoice Total": 4.5628176792433806e-9, + "Line Item Name": 4.575837042608555e-9, + "Line Item Quantity": 1.3403760767971562e-8, + "Line Item Total": 3.6726991226032624e-8, + "Vendor Name": 9.200683770416163e-9 + }, + "field_id": 858118, + "location_type": "exact", + "text": "$1,225.00", + "groupings": [], + "normalized": { + "text": "$1,225.00", + "start": 537, + "end": 546, + "structured": null, + "formatted": "$1,225.00", + "status": "SUCCESS", + "comparison_type": "string", + "comparison_value": "$1,225.00", + "validation": [ + { + "validation_type": "TYPE_CONVERSION", + "error_message": null, + "validation_status": "SUCCESS" + } + ] + } + }, + { + "label": "Invoice Tax", + "spans": [ + { "start": 557, "end": 563, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:11", + "confidence": { + "Invoice Date": 6.549097975039331e-8, + "Invoice Number": 1.693945250735851e-8, + "Invoice Subtotal": 4.138750497872934e-8, + "Invoice Tax": 0.9999997615814209, + "Invoice Total": 1.009439642984944e-8, + "Line Item Name": 6.045966927104018e-8, + "Line Item Quantity": 
4.567436207025821e-8, + "Line Item Total": 2.1588691723195552e-8, + "Vendor Name": 7.856115757931548e-9 + }, + "field_id": 858125, + "location_type": "exact", + "text": "$76.56", + "groupings": [], + "normalized": { + "text": "$76.56", + "start": 557, + "end": 563, + "structured": null, + "formatted": "$76.56", + "status": "SUCCESS", + "comparison_type": "string", + "comparison_value": "$76.56", + "validation": [ + { + "validation_type": "TYPE_CONVERSION", + "error_message": null, + "validation_status": "SUCCESS" + } + ] + } + }, + { + "label": "Invoice Total", + "spans": [ + { "start": 570, "end": 579, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:12", + "confidence": { + "Invoice Date": 3.388734981513153e-8, + "Invoice Number": 4.117489993404888e-8, + "Invoice Subtotal": 2.353094430418423e-8, + "Invoice Tax": 1.6204319663870592e-8, + "Invoice Total": 0.9999997019767761, + "Line Item Name": 7.487341413536797e-9, + "Line Item Quantity": 4.3006924244082256e-8, + "Line Item Total": 7.838348636823866e-8, + "Vendor Name": 3.8579965888629886e-8 + }, + "field_id": 858124, + "location_type": "exact", + "text": "$1,301.56", + "groupings": [], + "normalized": { + "text": "$1,301.56", + "start": 570, + "end": 579, + "structured": null, + "formatted": "$1,301.56", + "status": "SUCCESS", + "comparison_type": "string", + "comparison_value": "$1,301.56", + "validation": [ + { + "validation_type": "TYPE_CONVERSION", + "error_message": null, + "validation_status": "SUCCESS" + } + ] + } + } + ] + }, + "FINAL": { + "19409": [ + { + "label": "Invoice", + "field_id": 858117, + "confidence": { + "Invoice": 0.9999999853918985, + "Purchase Order": 1.4608101511772668e-8 + } + } + ], + "19411": [ + { + "text": "Updated!", + "label": "Vendor Name", + "spans": [{ "end": 13, "start": 0, "page_num": 0 }], + "span_id": "93479:c:19411:idx:0", + "field_id": 858126, + "groupings": [], + "confidence": { + "Invoice Tax": 3.6883669451981405e-8, + "Vendor Name": 0.9999996423721313, + "Invoice Date": 3.3424697676309734e-8, + "Invoice Total": 2.7991509554681215e-8, + "Invoice Number": 3.447171437187535e-8, + "Line Item Name": 8.883939806025865e-9, + "Line Item Total": 5.176908146609094e-8, + "Invoice Subtotal": 2.993116865468437e-8, + "Line Item Quantity": 5.827023485949212e-8 + }, + "normalized": { + "end": 13, + "text": "Updated!", + "start": 0, + "status": "SUCCESS", + "formatted": "Updated!", + "structured": null, + "validation": [ + { + "error_message": null, + "validation_type": "TYPE_CONVERSION", + "validation_status": "SUCCESS" + } + ], + "comparison_type": "string", + "comparison_value": "HubSpot, Inc." 
+ }, + "location_type": "exact" + }, + { + "text": "Updated!", + "label": "Invoice Date", + "spans": [ + { "end": 135, "start": 125, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:1", + "field_id": 858119, + "groupings": [], + "confidence": { + "Invoice Tax": 5.890121812512916e-8, + "Vendor Name": 1.4272615089794272e-8, + "Invoice Date": 0.9999996423721313, + "Invoice Total": 2.9429369163835872e-8, + "Invoice Number": 3.765953948686729e-8, + "Line Item Name": 1.0651284299001418e-7, + "Line Item Total": 4.1870002576160914e-8, + "Invoice Subtotal": 2.3938278914670263e-8, + "Line Item Quantity": 1.2222901091263338e-7 + }, + "normalized": { + "end": 135, + "text": "Updated!", + "start": 125, + "status": "SUCCESS", + "formatted": "Updated!", + "structured": null, + "validation": [ + { + "error_message": null, + "validation_type": "TYPE_CONVERSION", + "validation_status": "SUCCESS" + } + ], + "comparison_type": "string", + "comparison_value": "06/21/2016" + }, + "location_type": "exact" + }, + { + "text": "Updated!", + "label": "Invoice Number", + "spans": [ + { "end": 153, "start": 146, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:2", + "field_id": 858120, + "groupings": [], + "confidence": { + "Invoice Tax": 3.253961722293752e-8, + "Vendor Name": 5.3663740118281567e-8, + "Invoice Date": 1.8282889868714847e-8, + "Invoice Total": 1.3829179579261108e-7, + "Invoice Number": 0.9999997019767761, + "Line Item Name": 4.505617923200589e-8, + "Line Item Total": 2.1834006602716727e-8, + "Invoice Subtotal": 5.3069218353130054e-8, + "Line Item Quantity": 2.9700066406235237e-8 + }, + "normalized": { + "end": 153, + "text": "Updated!", + "start": 146, + "status": "SUCCESS", + "formatted": "Updated!", + "structured": null, + "validation": [ + { + "error_message": null, + "validation_type": "TYPE_CONVERSION", + "validation_status": "SUCCESS" + } + ], + "comparison_type": "string", + "comparison_value": "3927578" + }, + "location_type": "exact" + }, + { + "text": "Updated!", + "label": "Line Item Name", + "spans": [ + { "end": 407, "start": 340, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:3", + "field_id": 858121, + "groupings": [ + { + "group_id": "19411:Invoice Line Item", + "group_name": "Invoice Line Item", + "group_index": 4 + } + ], + "confidence": { + "Invoice Tax": 8.453332611679798e-8, + "Vendor Name": 7.4245527059702e-8, + "Invoice Date": 7.043927041650022e-8, + "Invoice Total": 1.9014857244314953e-8, + "Invoice Number": 1.85422432963378e-8, + "Line Item Name": 0.9999997019767761, + "Line Item Total": 6.951040631975047e-9, + "Invoice Subtotal": 9.213624529991193e-9, + "Line Item Quantity": 8.187748257171279e-9 + }, + "normalized": { + "end": 407, + "text": "Updated!", + "start": 340, + "status": "SUCCESS", + "formatted": "Updated!", + "structured": null, + "validation": [ + { + "error_message": null, + "validation_type": "TYPE_CONVERSION", + "validation_status": "SUCCESS" + } + ], + "comparison_type": "string", + "comparison_value": "HubSpot Enterprise\nIncluded Contacts\nEnterprise Contacts - Per 1000" + }, + "location_type": "exact" + }, + { + "text": "Updated!", + "label": "Line Item Quantity", + "spans": [ + { "end": 477, "start": 476, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:4", + "field_id": 858122, + "groupings": [ + { + "group_id": "19411:Invoice Line Item", + "group_name": "Invoice Line Item", + "group_index": 4 + } + ], + "confidence": { + "Invoice Tax": 1.196322472196698e-7, + "Vendor Name": 1.2245095604157541e-8, + "Invoice Date": 9.375153098289957e-8, + 
"Invoice Total": 7.469050444797176e-8, + "Invoice Number": 5.951939741066781e-8, + "Line Item Name": 6.045668499154999e-8, + "Line Item Total": 1.0597534583212109e-7, + "Invoice Subtotal": 6.869062474379461e-8, + "Line Item Quantity": 0.9999992847442627 + }, + "normalized": { + "end": 477, + "text": "Updated!", + "start": 476, + "status": "SUCCESS", + "formatted": "Updated!", + "structured": null, + "validation": [ + { + "error_message": null, + "validation_type": "TYPE_CONVERSION", + "validation_status": "SUCCESS" + } + ], + "comparison_type": "string", + "comparison_value": "1" + }, + "location_type": "exact" + }, + { + "text": "Updated!", + "label": "Line Item Total", + "spans": [ + { "end": 487, "start": 478, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:5", + "field_id": 858123, + "groupings": [ + { + "group_id": "19411:Invoice Line Item", + "group_name": "Invoice Line Item", + "group_index": 4 + } + ], + "confidence": { + "Invoice Tax": 6.359238025055447e-9, + "Vendor Name": 3.079120602933472e-8, + "Invoice Date": 2.9472216667159046e-8, + "Invoice Total": 1.601269694617713e-8, + "Invoice Number": 6.045723477399179e-9, + "Line Item Name": 4.507643325268873e-8, + "Line Item Total": 0.9999998211860657, + "Invoice Subtotal": 3.819852167907811e-8, + "Line Item Quantity": 9.02977035366348e-8 + }, + "normalized": { + "end": 487, + "text": "Updated!", + "start": 478, + "status": "SUCCESS", + "formatted": "Updated!", + "structured": null, + "validation": [ + { + "error_message": null, + "validation_type": "TYPE_CONVERSION", + "validation_status": "SUCCESS" + } + ], + "comparison_type": "string", + "comparison_value": "$1,200.00" + }, + "location_type": "exact" + }, + { + "text": "Updated!", + "label": "Line Item Quantity", + "spans": [ + { "end": 501, "start": 499, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:6", + "field_id": 858122, + "groupings": [ + { + "group_id": "19411:Invoice Line Item", + "group_name": "Invoice Line Item", + "group_index": 5 + } + ], + "confidence": { + "Invoice Tax": 1.2959037576365517e-7, + "Vendor Name": 1.0360611923942997e-8, + "Invoice Date": 1.4731193687111954e-7, + "Invoice Total": 5.365328092921118e-8, + "Invoice Number": 4.397265485067692e-8, + "Line Item Name": 4.536295961088399e-8, + "Line Item Total": 6.116934514466266e-8, + "Invoice Subtotal": 8.046448840559606e-8, + "Line Item Quantity": 0.9999994039535522 + }, + "normalized": { + "end": 501, + "text": "Updated!", + "start": 499, + "status": "SUCCESS", + "formatted": "Updated!", + "structured": null, + "validation": [ + { + "error_message": null, + "validation_type": "TYPE_CONVERSION", + "validation_status": "SUCCESS" + } + ], + "comparison_type": "string", + "comparison_value": "10" + }, + "location_type": "exact" + }, + { + "text": "Updated!", + "label": "Line Item Total", + "spans": [ + { "end": 507, "start": 502, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:7", + "field_id": 858123, + "groupings": [ + { + "group_id": "19411:Invoice Line Item", + "group_name": "Invoice Line Item", + "group_index": 5 + } + ], + "confidence": { + "Invoice Tax": 5.000234892804656e-9, + "Vendor Name": 1.9458024524965367e-8, + "Invoice Date": 2.487784023230688e-8, + "Invoice Total": 1.1822152146123699e-8, + "Invoice Number": 5.1960333813383386e-9, + "Line Item Name": 3.007375326546935e-8, + "Line Item Total": 0.9999997615814209, + "Invoice Subtotal": 2.7321593876195038e-8, + "Line Item Quantity": 8.39153528886527e-8 + }, + "normalized": { + "end": 507, + "text": "Updated!", + "start": 502, + 
"status": "SUCCESS", + "formatted": "Updated!", + "structured": null, + "validation": [ + { + "error_message": null, + "validation_type": "TYPE_CONVERSION", + "validation_status": "SUCCESS" + } + ], + "comparison_type": "string", + "comparison_value": "$0.00" + }, + "location_type": "exact" + }, + { + "text": "Updated!", + "label": "Line Item Quantity", + "spans": [ + { "end": 520, "start": 519, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:8", + "field_id": 858122, + "groupings": [ + { + "group_id": "19411:Invoice Line Item", + "group_name": "Invoice Line Item", + "group_index": 6 + } + ], + "confidence": { + "Invoice Tax": 1.3488433125985466e-7, + "Vendor Name": 9.074271112297083e-9, + "Invoice Date": 1.3724279313009902e-7, + "Invoice Total": 5.79847068138406e-8, + "Invoice Number": 4.43608350053637e-8, + "Line Item Name": 4.117256224844823e-8, + "Line Item Total": 8.499782211401907e-8, + "Invoice Subtotal": 9.689298252624212e-8, + "Line Item Quantity": 0.9999994039535522 + }, + "normalized": { + "end": 520, + "text": "Updated!", + "start": 519, + "status": "SUCCESS", + "formatted": "Updated!", + "structured": null, + "validation": [ + { + "error_message": null, + "validation_type": "TYPE_CONVERSION", + "validation_status": "SUCCESS" + } + ], + "comparison_type": "string", + "comparison_value": "5" + }, + "location_type": "exact" + }, + { + "text": "Updated!", + "label": "Line Item Total", + "spans": [ + { "end": 527, "start": 521, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:9", + "field_id": 858123, + "groupings": [ + { + "group_id": "19411:Invoice Line Item", + "group_name": "Invoice Line Item", + "group_index": 6 + } + ], + "confidence": { + "Invoice Tax": 5.606381581202413e-9, + "Vendor Name": 2.1103359060248295e-8, + "Invoice Date": 3.019848193730468e-8, + "Invoice Total": 1.3499708018116507e-8, + "Invoice Number": 5.982286666039727e-9, + "Line Item Name": 3.2691968243625524e-8, + "Line Item Total": 0.9999997615814209, + "Invoice Subtotal": 4.329503866529194e-8, + "Line Item Quantity": 8.082533042852447e-8 + }, + "normalized": { + "end": 527, + "text": "Updated!", + "start": 521, + "status": "SUCCESS", + "formatted": "Updated!", + "structured": null, + "validation": [ + { + "error_message": null, + "validation_type": "TYPE_CONVERSION", + "validation_status": "SUCCESS" + } + ], + "comparison_type": "string", + "comparison_value": "$25.00" + }, + "location_type": "exact" + }, + { + "text": "Updated!", + "label": "Invoice Subtotal", + "spans": [ + { "end": 546, "start": 537, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:10", + "field_id": 858118, + "groupings": [], + "confidence": { + "Invoice Tax": 3.089213507223576e-8, + "Vendor Name": 9.200683770416163e-9, + "Invoice Date": 9.50030809576674e-9, + "Invoice Total": 4.5628176792433806e-9, + "Invoice Number": 3.281632743323826e-8, + "Line Item Name": 4.575837042608555e-9, + "Line Item Total": 3.6726991226032624e-8, + "Invoice Subtotal": 1.0, + "Line Item Quantity": 1.3403760767971562e-8 + }, + "normalized": { + "end": 546, + "text": "Updated!", + "start": 537, + "status": "SUCCESS", + "formatted": "Updated!", + "structured": null, + "validation": [ + { + "error_message": null, + "validation_type": "TYPE_CONVERSION", + "validation_status": "SUCCESS" + } + ], + "comparison_type": "string", + "comparison_value": "$1,225.00" + }, + "location_type": "exact" + }, + { + "text": "Updated!", + "label": "Invoice Tax", + "spans": [ + { "end": 563, "start": 557, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:11", + 
"field_id": 858125, + "groupings": [], + "confidence": { + "Invoice Tax": 0.9999997615814209, + "Vendor Name": 7.856115757931548e-9, + "Invoice Date": 6.549097975039331e-8, + "Invoice Total": 1.009439642984944e-8, + "Invoice Number": 1.693945250735851e-8, + "Line Item Name": 6.045966927104018e-8, + "Line Item Total": 2.1588691723195552e-8, + "Invoice Subtotal": 4.138750497872934e-8, + "Line Item Quantity": 4.567436207025821e-8 + }, + "normalized": { + "end": 563, + "text": "Updated!", + "start": 557, + "status": "SUCCESS", + "formatted": "Updated!", + "structured": null, + "validation": [ + { + "error_message": null, + "validation_type": "TYPE_CONVERSION", + "validation_status": "SUCCESS" + } + ], + "comparison_type": "string", + "comparison_value": "$76.56" + }, + "location_type": "exact" + }, + { + "text": "Updated!", + "label": "Invoice Total", + "spans": [ + { "end": 579, "start": 570, "page_num": 0 } + ], + "span_id": "93479:c:19411:idx:12", + "field_id": 858124, + "groupings": [], + "confidence": { + "Invoice Tax": 1.6204319663870592e-8, + "Vendor Name": 3.8579965888629886e-8, + "Invoice Date": 3.388734981513153e-8, + "Invoice Total": 0.9999997019767761, + "Invoice Number": 4.117489993404888e-8, + "Line Item Name": 7.487341413536797e-9, + "Line Item Total": 7.838348636823866e-8, + "Invoice Subtotal": 2.353094430418423e-8, + "Line Item Quantity": 4.3006924244082256e-8 + }, + "normalized": { + "end": 579, + "text": "Updated!", + "start": 570, + "status": "SUCCESS", + "formatted": "Updated!", + "structured": null, + "validation": [ + { + "error_message": null, + "validation_type": "TYPE_CONVERSION", + "validation_status": "SUCCESS" + } + ], + "comparison_type": "string", + "comparison_value": "$1,301.56" + }, + "location_type": "exact" + } + ] + } + }, + "rejected": { + "models": {}, + "components": { "19409": [], "19411": [] } + } + } + ], + "reviews": { + "69196": { + "review_id": 69196, + "reviewer_id": 422, + "review_notes": null, + "review_rejected": false, + "review_type": "auto" + } + }, + "errored_files": {} +} diff --git a/tests/results/test_document.py b/tests/results/test_document.py index df9862da..6e71855b 100644 --- a/tests/results/test_document.py +++ b/tests/results/test_document.py @@ -40,6 +40,14 @@ def test_empy_v3_sections() -> None: } } }, + "component_metadata": { + "456": { + "id": 456, + "component_type": "static_model", + "task_type": "annotation", + "name": "Empty Model Section" + } + }, "submission_results": [ { "submissionfile_id": 0, @@ -49,6 +57,11 @@ def test_empy_v3_sections() -> None: "ORIGINAL": { "123": [] } + }, + "component_results": { + "ORIGINAL": { + "456": [] + } } } ] @@ -61,6 +74,8 @@ def test_empy_v3_sections() -> None: "model_results": { "123": [], }, - "component_results": {}, + "component_results": { + "456": [], + }, } ] diff --git a/tests/results/test_predictionlist.py b/tests/results/test_predictionlist.py index 5df12291..4f15cfa4 100644 --- a/tests/results/test_predictionlist.py +++ b/tests/results/test_predictionlist.py @@ -27,6 +27,7 @@ def document() -> Document: error="", traceback="", _model_sections=frozenset({"124", "123", "122", "121"}), + _component_sections=frozenset(), ) From 3706eb93a7ddd2676be8acb7c315ed00deb9862f Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 25 Apr 2025 12:02:46 -0500 Subject: [PATCH 08/14] Improve the specificity of errors raised by `results.utils.get()` --- indico_toolkit/results/utils.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git 
a/indico_toolkit/results/utils.py b/indico_toolkit/results/utils.py index 3b016bb7..1c65d738 100644 --- a/indico_toolkit/results/utils.py +++ b/indico_toolkit/results/utils.py @@ -1,33 +1,35 @@ from collections.abc import Iterable, Iterator from typing import Callable, TypeVar -from .errors import ResultError - Value = TypeVar("Value") def get(result: object, value_type: "type[Value]", *keys: "str | int") -> Value: """ - Return the value obtained by traversing `result` using `keys` as indices if that - value has type `value_type`. Raise a `ResultError` otherwise. + Return the value of type `value_type` obtained by traversing `result` using `keys`. + Raise an error if a key doesn't exist or the value has the wrong type. """ for key in keys: - if isinstance(key, str) and isinstance(result, dict) and key in result: - result = result[key] - elif isinstance(key, int) and isinstance(result, list) and key < len(result): - result = result[key] + if isinstance(result, dict): + if key in result: + result = result[key] + else: + raise KeyError(f"`{result!r}` does not contain key `{key!r}`") + elif isinstance(result, list): + if isinstance(key, int): + if 0 >= key < len(result): + result = result[key] + else: + raise IndexError(f"`{result!r}` does not contain index `{key!r}`") + else: + TypeError(f"`{result!r}` can not be indexed with `{key!r}`") else: - raise ResultError( - f"result object `{type(result)!r}` does not contain key `{key!r}`" - ) + TypeError(f"`{result!r}` is not a container") if isinstance(result, value_type): return result else: - raise ResultError( - f"result object `{type(result)!r}` does not have a value for " - f"key `{key!r}` of type `{value_type}`" - ) + raise TypeError(f"`{result!r}` is not of type `{value_type}`") def has(result: object, value_type: "type[Value]", *keys: "str | int") -> bool: From cc981d6e443a627e6f6fb8ba59ccc3c3c60f71fe Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 25 Apr 2025 12:04:00 -0500 Subject: [PATCH 09/14] Parse and preserve full spans for classify + unbundle predictions --- .../results/predictions/unbundling.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/indico_toolkit/results/predictions/unbundling.py b/indico_toolkit/results/predictions/unbundling.py index c6d2b81e..c6512325 100644 --- a/indico_toolkit/results/predictions/unbundling.py +++ b/indico_toolkit/results/predictions/unbundling.py @@ -4,6 +4,7 @@ from ..review import Review from ..utils import get, omit from .prediction import Prediction +from .span import Span if TYPE_CHECKING: from typing import Any @@ -14,7 +15,14 @@ @dataclass class Unbundling(Prediction): - pages: "list[int]" + spans: "list[Span]" + + @property + def pages(self) -> "tuple[int, ...]": + """ + Return the pages covered by `self.spans`. 
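+
+        For example (illustrative), an unbundling whose spans fall on pages 0
+        and 1 yields `pages == (0, 1)`.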
+ """ + return tuple(span.page for span in self.spans) @staticmethod def from_v3_dict( @@ -32,10 +40,7 @@ def from_v3_dict( review=review, label=get(prediction, str, "label"), confidences=get(prediction, dict, "confidence"), - pages=[ - get(span, int, "page_num") - for span in get(prediction, list, "spans") # fmt: skip - ], + spans=sorted(map(Span.from_dict, get(prediction, list, "spans"))), extras=omit(prediction, "confidence", "label", "spans"), ) @@ -47,5 +52,5 @@ def to_v3_dict(self) -> "dict[str, Any]": **self.extras, "label": self.label, "confidence": self.confidences, - "spans": [{"page_num": page} for page in self.pages], + "spans": [span.to_dict() for span in self.spans], } From 1d9671bf78b9c0f2318101a3584fe4ead0f105b5 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Fri, 25 Apr 2025 12:04:30 -0500 Subject: [PATCH 10/14] Support `group = next(group)` idiom for linked label groups --- indico_toolkit/results/predictions/group.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/indico_toolkit/results/predictions/group.py b/indico_toolkit/results/predictions/group.py index 3fb94960..e9e000d0 100644 --- a/indico_toolkit/results/predictions/group.py +++ b/indico_toolkit/results/predictions/group.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import dataclass, replace from typing import TYPE_CHECKING from ..utils import get @@ -13,6 +13,14 @@ class Group: name: str index: int + def __next__(self) -> "Group": + """ + Return the `Group` with the next index. + + Supports `group = next(group)`. + """ + return replace(self, index=self.index + 1) + @staticmethod def from_dict(group: object) -> "Group": return Group( From 3a4bca78490c326b6fabecf163527c1d8c09d487 Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Mon, 5 May 2025 09:01:07 -0500 Subject: [PATCH 11/14] Improve the specificity of errors raised by `results.utils.get()` more Avoid including an entire nested JSON structure when possible: - Only include dict keys instead of the whole dict, - Only include list length instead of the whole list, - Only include the object type that can't be traversed, etc. 
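
For illustration only (not part of the diff below), a rough sketch of the
intended behavior, assuming the `get()` helper from
`indico_toolkit/results/utils.py` and a hypothetical prediction dict:

    from indico_toolkit.results.utils import get

    prediction = {"label": "Vendor Name", "confidence": {"Vendor Name": 0.99}}

    assert get(prediction, str, "label") == "Vendor Name"

    try:
        get(prediction, str, "spans")
    except KeyError as error:
        # Only the available keys are reported, roughly:
        # 'spans' not in dict_keys(['label', 'confidence'])
        print(error)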
--- indico_toolkit/results/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/indico_toolkit/results/utils.py b/indico_toolkit/results/utils.py index 1c65d738..d16ddd27 100644 --- a/indico_toolkit/results/utils.py +++ b/indico_toolkit/results/utils.py @@ -14,22 +14,22 @@ def get(result: object, value_type: "type[Value]", *keys: "str | int") -> Value: if key in result: result = result[key] else: - raise KeyError(f"`{result!r}` does not contain key `{key!r}`") + raise KeyError(f"{key!r} not in {result.keys()!r}") elif isinstance(result, list): if isinstance(key, int): if 0 >= key < len(result): result = result[key] else: - raise IndexError(f"`{result!r}` does not contain index `{key!r}`") + raise IndexError(f"list index {key} out of range {len(result)}") else: - TypeError(f"`{result!r}` can not be indexed with `{key!r}`") + TypeError(f"list cannot be indexed with {key!r}") else: - TypeError(f"`{result!r}` is not a container") + TypeError(f"{type(result)} cannot be traversed") if isinstance(result, value_type): return result else: - raise TypeError(f"`{result!r}` is not of type `{value_type}`") + raise TypeError(f"value `{result!r}` does not have expected type {value_type}") def has(result: object, value_type: "type[Value]", *keys: "str | int") -> bool: From 5712724a74626c22ad93c907698f9227a954914b Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Mon, 5 May 2025 09:14:32 -0500 Subject: [PATCH 12/14] Clean up some error messages --- indico_toolkit/results/__init__.py | 2 +- indico_toolkit/results/predictionlist.py | 2 +- indico_toolkit/results/predictions/__init__.py | 4 ++-- indico_toolkit/results/result.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/indico_toolkit/results/__init__.py b/indico_toolkit/results/__init__.py index 97e6acc2..93872c50 100644 --- a/indico_toolkit/results/__init__.py +++ b/indico_toolkit/results/__init__.py @@ -101,4 +101,4 @@ def _load(result: object) -> Result: elif file_version == 3: return Result.from_v3_dict(result) else: - raise ResultError(f"unsupported file version `{file_version!r}`") + raise ResultError(f"unsupported file version `{file_version}`") diff --git a/indico_toolkit/results/predictionlist.py b/indico_toolkit/results/predictionlist.py index cbea64f8..34db0b62 100644 --- a/indico_toolkit/results/predictionlist.py +++ b/indico_toolkit/results/predictionlist.py @@ -337,7 +337,7 @@ def to_changes(self, result: "Result") -> "Any": elif result.version == 3: return self.to_v3_changes(result.documents) else: - raise ValueError(f"unsupported file version `{result.version!r}`") + raise ValueError(f"unsupported file version `{result.version}`") def to_v1_changes(self, document: "Document") -> "dict[str, Any]": """ diff --git a/indico_toolkit/results/predictions/__init__.py b/indico_toolkit/results/predictions/__init__.py index aa652770..b385c80f 100644 --- a/indico_toolkit/results/predictions/__init__.py +++ b/indico_toolkit/results/predictions/__init__.py @@ -62,7 +62,7 @@ def from_v1_dict( elif model.type == FORM_EXTRACTION: return FormExtraction.from_v1_dict(document, model, review, prediction) else: - raise ResultError(f"unsupported v1 model type `{model.type!r}`") + raise ResultError(f"unsupported v1 model type {model.type!r}") def from_v3_dict( @@ -85,4 +85,4 @@ def from_v3_dict( elif model.type == UNBUNDLING: return Unbundling.from_v3_dict(document, model, review, prediction) else: - raise ResultError(f"unsupported v3 model type `{model.type!r}`") + raise ResultError(f"unsupported v3 model 
type {model.type!r}") diff --git a/indico_toolkit/results/result.py b/indico_toolkit/results/result.py index ceefb5cd..6325c715 100644 --- a/indico_toolkit/results/result.py +++ b/indico_toolkit/results/result.py @@ -178,7 +178,7 @@ def from_v3_dict(result: object) -> "Result": component_metadata, str, component_id, "component_type" ) raise ResultError( - f"unsupported component type `{component_type!r}` " + f"unsupported component type {component_type!r} " f"for component {component_id}" ) else: From 2f7cc38f7de5e203a43355286cf97cec97a8923b Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Thu, 8 May 2025 11:46:59 -0500 Subject: [PATCH 13/14] Fix errors in `utils.get()` and `utils.has()` --- indico_toolkit/results/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/indico_toolkit/results/utils.py b/indico_toolkit/results/utils.py index d16ddd27..97f70897 100644 --- a/indico_toolkit/results/utils.py +++ b/indico_toolkit/results/utils.py @@ -17,19 +17,19 @@ def get(result: object, value_type: "type[Value]", *keys: "str | int") -> Value: raise KeyError(f"{key!r} not in {result.keys()!r}") elif isinstance(result, list): if isinstance(key, int): - if 0 >= key < len(result): + if 0 <= key < len(result): result = result[key] else: - raise IndexError(f"list index {key} out of range {len(result)}") + raise IndexError(f"{key} out of range [0,{len(result)})") else: - TypeError(f"list cannot be indexed with {key!r}") + raise TypeError(f"list can't be indexed with {key!r}") else: - TypeError(f"{type(result)} cannot be traversed") + raise TypeError(f"{type(result)} can't be traversed") if isinstance(result, value_type): return result else: - raise TypeError(f"value `{result!r}` does not have expected type {value_type}") + raise TypeError(f"value `{result!r}` doesn't have type {value_type}") def has(result: object, value_type: "type[Value]", *keys: "str | int") -> bool: @@ -37,9 +37,9 @@ def has(result: object, value_type: "type[Value]", *keys: "str | int") -> bool: Check if `result` can be traversed using `keys` to a value of type `value_type`. """ for key in keys: - if isinstance(key, str) and isinstance(result, dict) and key in result: + if isinstance(result, dict) and key in result: result = result[key] - elif isinstance(key, int) and isinstance(result, list) and key < len(result): + elif isinstance(result, list) and isinstance(key, int) and 0 <= key < len(result): # fmt: skip # noqa: E501 result = result[key] else: return False From e7e53449b14535e70a055ecc03f71e30bc69b71b Mon Sep 17 00:00:00 2001 From: Michael Welborn Date: Thu, 8 May 2025 12:03:24 -0500 Subject: [PATCH 14/14] Bump version and update changelog --- CHANGELOG.md | 13 +++++++++++++ indico_toolkit/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e268c7ff..2d89f631 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -153,3 +153,16 @@ This is the first major version release tested to work on Indico 6.X. * Added `etloutput` module. * Refactored `retry` decorator with asyncio support. * Switched to Poetry for packaging and dependency management. + +## 6.14.1 3/20/25 + +* Improved Poetry and Poe configuration. +* Update more attributes when prediction text changes to avoid TAK normalization issues. + +## 6.14.2 5/8/25 + +* Fixed Mypy configuration. +* Removed `AutoPopulator`, `CustomOcr`, `Datasets`, `DocExtraction`, `Reviewer` classes. +* Added support for imported models using IPA 7.2 `component_metadata` section. 
+* Parse and preserve full span information for `Unbundling` predictions.
+* Add `group = next(group)` idiom.
diff --git a/indico_toolkit/__init__.py b/indico_toolkit/__init__.py
index 6148a3cd..31d417c4 100644
--- a/indico_toolkit/__init__.py
+++ b/indico_toolkit/__init__.py
@@ -21,4 +21,4 @@
     "ToolkitStaggeredLoopError",
     "ToolkitStatusError",
 )
-__version__ = "6.14.1"
+__version__ = "6.14.2"
diff --git a/pyproject.toml b/pyproject.toml
index ba174db3..5bb3043a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ authors = [
 readme = "README.md"
 urls = { source = "https://github.com/IndicoDataSolutions/Indico-Solutions-Toolkit" }
 requires-python = ">=3.10"
-version = "6.14.1"
+version = "6.14.2"
 dependencies = ["indico-client (>=6.14.0,<7.0.0)"]
 
 [project.optional-dependencies]