From c9e8ee488a7b05b8e1139253a52d82379885ac5e Mon Sep 17 00:00:00 2001
From: Scott
Date: Mon, 31 Mar 2025 14:33:33 -0400
Subject: [PATCH] remove doc extraction, review, and dataset classes

---
 examples/copy_teach_task.py                   |  20 -
 .../create_auto_classification_workflow.py    |  27 --
 examples/dataset_tasks.py                     |  25 --
 examples/merge_snapshots.py                   |  20 -
 examples/pdf_highlighter.py                   |  35 --
 examples/submitting_to_doc_extraction.py      |  24 --
 indico_toolkit/auto_populate/__init__.py      |   3 -
 indico_toolkit/auto_populate/populator.py     | 362 ------------------
 indico_toolkit/auto_populate/types.py         |  66 ----
 indico_toolkit/indico_wrapper/__init__.py     |   6 -
 indico_toolkit/indico_wrapper/dataset.py      | 114 ------
 .../indico_wrapper/doc_extraction.py          |  79 ----
 indico_toolkit/indico_wrapper/reviewer.py     | 111 ------
 indico_toolkit/ocr/__init__.py                |   2 -
 indico_toolkit/ocr/customocr_object.py        |  42 --
 mypy.ini                                      |   3 -
 tests/integration/conftest.py                 |  33 +-
 .../indico_wrapper/test_dataset.py            |  40 --
 .../indico_wrapper/test_doc_extraction.py     | 127 ------
 .../indico_wrapper/test_reviewer.py           |  60 ---
 tests/integration/ocr/__init__.py             |   0
 .../integration/ocr/test_customocr_object.py  |  47 ---
 tests/integration/ocr/test_ondoc_object.py    |  63 ---
 tests/integration/ocr/test_standard_object.py |  33 --
 tests/integration/test_populator.py           |  80 ----
 25 files changed, 1 insertion(+), 1421 deletions(-)
 delete mode 100644 examples/copy_teach_task.py
 delete mode 100644 examples/create_auto_classification_workflow.py
 delete mode 100644 examples/dataset_tasks.py
 delete mode 100644 examples/pdf_highlighter.py
 delete mode 100644 examples/submitting_to_doc_extraction.py
 delete mode 100644 indico_toolkit/auto_populate/__init__.py
 delete mode 100644 indico_toolkit/auto_populate/populator.py
 delete mode 100644 indico_toolkit/auto_populate/types.py
 delete mode 100644 indico_toolkit/indico_wrapper/dataset.py
 delete mode 100644 indico_toolkit/indico_wrapper/doc_extraction.py
 delete mode 100644 indico_toolkit/indico_wrapper/reviewer.py
 delete mode 100644 indico_toolkit/ocr/customocr_object.py
 delete mode 100644 tests/integration/indico_wrapper/test_dataset.py
 delete mode 100644 tests/integration/indico_wrapper/test_doc_extraction.py
 delete mode 100644 tests/integration/indico_wrapper/test_reviewer.py
 delete mode 100644 tests/integration/ocr/__init__.py
 delete mode 100644 tests/integration/ocr/test_customocr_object.py
 delete mode 100644 tests/integration/ocr/test_ondoc_object.py
 delete mode 100644 tests/integration/ocr/test_standard_object.py
 delete mode 100644 tests/integration/test_populator.py

diff --git a/examples/copy_teach_task.py b/examples/copy_teach_task.py
deleted file mode 100644
index d0b7f459..00000000
--- a/examples/copy_teach_task.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from indico_toolkit import create_client
-from indico_toolkit.auto_populate import AutoPopulator
-
-"""
-Create a new copied Workflow based on a given Teach Task Id
-and corresponding Dataset Id.
-"""
-
-HOST = "app.indico.io"
-API_TOKEN_PATH = "./indico_api_token.txt"
-DATASET_ID = 0
-TEACH_TASK_ID = 0
-
-client = create_client(HOST, API_TOKEN_PATH)
-auto_populator = AutoPopulator(client)
-new_workflow = auto_populator.copy_teach_task(
-    dataset_id=DATASET_ID,
-    teach_task_id=TEACH_TASK_ID,
-    workflow_name="Copied Workflow",
-)
diff --git a/examples/create_auto_classification_workflow.py b/examples/create_auto_classification_workflow.py
deleted file mode 100644
index 355adeb4..00000000
--- a/examples/create_auto_classification_workflow.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from indico_toolkit import create_client
-from indico_toolkit.auto_populate import AutoPopulator
-
-"""
-Create an Indico Classification Workflow without any labeling using an organized
-directory/folder structure. Each folder/directory should contain only one file type.
-
-For example, you would target '/base_directory/' if you had your files organized like:
-
-/base_directory/
-/base_directory/invoices/ -> contains only invoice files
-/base_directory/disclosures/ -> contains only disclosure files
-"""
-
-HOST = "app.indico.io"
-API_TOKEN_PATH = "./indico_api_token.txt"
-
-DIRECTORY_FILE_PATH = "./base_directory/"
-
-client = create_client(HOST, API_TOKEN_PATH)
-auto_populator = AutoPopulator(client)
-new_workflow = auto_populator.create_auto_classification_workflow(
-    DIRECTORY_FILE_PATH,
-    "My dataset",
-    "My workflow",
-    "My teach task",
-)
diff --git a/examples/dataset_tasks.py b/examples/dataset_tasks.py
deleted file mode 100644
index a53a2196..00000000
--- a/examples/dataset_tasks.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from indico_toolkit import create_client
-from indico_toolkit.indico_wrapper import Datasets, Download
-from indico_toolkit.pipelines import FileProcessing
-
-DATASET_ID = 1234
-HOST = "app.indico.io"
-API_TOKEN_PATH = "./indico_api_token.txt"
-
-# Instantiate the datasets class
-client = create_client(HOST, API_TOKEN_PATH)
-datasets = Datasets(client, DATASET_ID)
-downloader = Download(client)
-"""
-Example 1:
-
-Upload files to an existing dataset in batches
-"""
-# Collect files to upload
-fp = FileProcessing()
-fp.get_file_paths_from_dir("./datasets/disclosures/")
-
-# Upload files to dataset in batches
-for paths in fp.batch_files(batch_size=2):
-    datasets.add_files_to_dataset(paths)
-    print(f"Uploaded {len(paths)} files")
-""" -client = create_client(HOST, API_TOKEN_PATH) -dataset = Datasets(client) -uploaded_dataset = dataset.create_dataset([OUTPUT_PATH], dataset_name="my_dataset") -print(f"My Dataset ID is {uploaded_dataset.id}") -model = dataset.train_model( - uploaded_dataset, - model_name="my_model", - source_col=main_snap.text_col, - target_col=main_snap.label_col, - wait=False, -) -print(f"My Model Group ID is {model.id}") diff --git a/examples/pdf_highlighter.py b/examples/pdf_highlighter.py deleted file mode 100644 index 3540205b..00000000 --- a/examples/pdf_highlighter.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Highlight Indico Extraction Predictions on the source PDF -""" - -from indico_toolkit import create_client -from indico_toolkit.highlighter import Highlighter -from indico_toolkit.indico_wrapper import Workflow - -WORKFLOW_ID = 1418 -HOST = "app.indico.io" -API_TOKEN_PATH = "./indico_api_token.txt" -PATH_TO_DOCUMENT = "./mydocument.pdf" -# Instantiate the workflow class -client = create_client(HOST, API_TOKEN_PATH) -wflow = Workflow(client) - -# Get predictions and ondocument OCR object -submission_ids = wflow.submit_documents_to_workflow(WORKFLOW_ID, [PATH_TO_DOCUMENT]) -submission_result = wflow.get_submission_results_from_ids(submission_ids)[0] -ocr_object = wflow.get_ondoc_ocr_from_etl_url(submission_result.etl_url) - -# Highlight Predictions onto source document and write it to disc -highlighter = Highlighter(submission_result.predictions, PATH_TO_DOCUMENT) -highlighter.collect_tokens(ocr_object.token_objects) -highlighter.highlight_pdf("./highlighted_doc.pdf", ocr_object.page_heights_and_widths) - -# You can also have unique color highlights for each label group, write the label above -# the highlight, and add bookmarks of what labels appear on which pages -highlighter.highlight_pdf( - "./highlighted_doc.pdf", - ocr_object.page_heights_and_widths, - all_yellow_highlight=False, - add_label_annotations=True, - add_bookmarks=True, -) diff --git a/examples/submitting_to_doc_extraction.py b/examples/submitting_to_doc_extraction.py deleted file mode 100644 index 2bc20cc2..00000000 --- a/examples/submitting_to_doc_extraction.py +++ /dev/null @@ -1,24 +0,0 @@ -from indico_toolkit import create_client -from indico_toolkit.indico_wrapper import DocExtraction -from indico_toolkit.pipelines import FileProcessing - -""" -Retrieves a list of raw full document texts for all files in a folder -""" - -HOST = "app.indico.io" -API_TOKEN_PATH = "./indico_api_token.txt" - -# Instantiate the doc_extraction class -client = create_client(HOST, API_TOKEN_PATH) -doc_extraction = DocExtraction(client=client, preset_config="ondocument") - -# Collect files to submit -fp = FileProcessing() -fp.get_file_paths_from_dir("./datasets/disclosures/") - -# Submit documents with optional text setting and save results to variable -doc_texts = [] -for paths in fp.batch_files(batch_size=10): - doc_texts.append(doc_extraction.run_ocr(filepaths=paths, text_setting="full_text")) -print(doc_texts) diff --git a/indico_toolkit/auto_populate/__init__.py b/indico_toolkit/auto_populate/__init__.py deleted file mode 100644 index c3330c10..00000000 --- a/indico_toolkit/auto_populate/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .populator import AutoPopulator - -__all__ = ("AutoPopulator",) diff --git a/indico_toolkit/auto_populate/populator.py b/indico_toolkit/auto_populate/populator.py deleted file mode 100644 index 9b2e6396..00000000 --- a/indico_toolkit/auto_populate/populator.py +++ /dev/null @@ -1,362 +0,0 @@ -import dataclasses 
diff --git a/indico_toolkit/auto_populate/__init__.py b/indico_toolkit/auto_populate/__init__.py
deleted file mode 100644
index c3330c10..00000000
--- a/indico_toolkit/auto_populate/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .populator import AutoPopulator
-
-__all__ = ("AutoPopulator",)
diff --git a/indico_toolkit/auto_populate/populator.py b/indico_toolkit/auto_populate/populator.py
deleted file mode 100644
index 9b2e6396..00000000
--- a/indico_toolkit/auto_populate/populator.py
+++ /dev/null
@@ -1,362 +0,0 @@
-import dataclasses
-import time
-from json import loads
-from os import PathLike
-from pathlib import Path
-from typing import Dict, List, Tuple, Union
-
-from indico import IndicoClient
-from indico.queries import (
-    CreateExport,
-    DownloadExport,
-    GetDataset,
-    GetModelGroup,
-)
-from indico.types import Workflow
-
-from ..errors import ToolkitPopulationError
-from ..structure.create_structure import Structure
-from .types import (
-    Example,
-    ExampleList,
-    LabelInput,
-    LabelInst,
-    TokenSpanInput,
-)
-
-
-class AutoPopulator:
-    def __init__(self, client: IndicoClient):
-        """
-        Module for manipulating and creating new workflows and teach tasks.
-
-        Args:
-            client (IndicoClient): instantiated Indico Client
-        """
-        self.client = client
-        self.structure = Structure(client)
-        self._exceptions = []
-
-    def create_auto_classification_workflow(
-        self,
-        directory_path: Union[str, PathLike[str]],
-        dataset_name: str,
-        workflow_name: str,
-        teach_task_name: str,
-        accepted_types: Tuple[str, ...] = (
-            "csv",
-            "doc",
-            "docx",
-            "eml",
-            "jpeg",
-            "jpg",
-            "msg",
-            "pdf",
-            "png",
-            "pptx",
-            "rtf",
-            "svg",
-            "tif",
-            "tiff",
-            "txt",
-            "xls",
-            "xlsx",
-        ),
-    ) -> Workflow:
-        """
-        Label and train a model based on a directory structure or existing teach task.
-        You should have a base directory containing subdirectories where each
-        directory contains a unique file type and only that file type.
-
-        Example:
-            base_directory/
-            base_directory/invoices/ -> folder containing only invoices
-            base_directory/disclosures/ -> folder containing only disclosures
-            etc. etc.
-        Args:
-            directory_path (str): Path to a directory containing your filepath structure
-            dataset_name (str): Name of created dataset
-            workflow_name (str): Name of created workflow
-            teach_task_name (str): Name of created teach task
-            accepted_types (Tuple[str], optional): List of accepted file types to search
-        Returns:
-            Workflow: a Workflow object representation of the newly created workflow
-        """
-
-        def valid_file(file: Path) -> bool:
-            return (
-                file.is_file() and file.suffix.strip(".").casefold() in accepted_types
-            )
-
-        folder = Path(directory_path)
-        files = list(filter(valid_file, folder.glob("*/*")))
-        classes = list(set(file.parent.name for file in files))
-        labeled_files = {file.name: [{"label": file.parent.name}] for file in files}
-
-        if len(classes) < 2:
-            raise ToolkitPopulationError(
-                "You must have documents in at least 2 directories, "
-                f"you only have {len(classes)}"
-            )
-
-        # Upload files to a new dataset.
-        dataset = self.structure.create_dataset(
-            dataset_name=dataset_name,
-            files_to_upload=files,
-            read_api=True,
-            single_column=False,
-            auto_rotate=False,
-            upscale_images=True,
-            languages=["ENG"],
-        )
-
-        # Create a new workflow with a classification model.
-        workflow = self.structure.create_workflow(workflow_name, dataset.id)
-        workflow = self.structure.add_teach_task(
-            task_name=teach_task_name,
-            labelset_name=f"{teach_task_name}_labelset",
-            target_names=classes,
-            dataset_id=dataset.id,
-            workflow_id=workflow.id,
-            model_type="classification",
-        )
-        teach_task_id = workflow.components[-1].model_group.questionnaire_id
-        labelset_id, model_group_id, label_map = self._get_teach_task_details(
-            teach_task_id
-        )
-
-        labels = self.get_labels_by_filename(model_group_id, labeled_files, label_map)
-        self.structure.label_teach_task(
-            label_set_id=labelset_id,
-            labels=list(map(dataclasses.asdict, labels)),
-            model_group_id=model_group_id,
-        )
-
-        return workflow
-
-    def copy_teach_task(
-        self,
-        dataset_id: int,
-        teach_task_id: int,
-        workflow_name: str,
-        data_column: str = "document",
-        rename_labels: Dict[str, str] = None,
-        remove_labels: List[str] = None,
-    ) -> Workflow:
-        """
-        Create a duplicate teach task on the same Indico platform.
-
-        Note: Does not work with datasets created with a snapshot
-
-        Args:
-            dataset_id (int): The dataset id of the dataset you wish to copy
-            teach_task_id (int): The teach task id of the corresponding teach task to
-                the dataset
-            workflow_name (string): The name of the newly created workflow
-            data_column (str, optional): The data column of the corresponding
-                dataset. Defaults to 'document'
-            rename_labels (dict, optional): Dictionary in format
-                {old_label_name : new_label_name}
-            remove_labels (list, optional): List of labels to remove from old teach task
-
-        Returns:
-            Workflow: a Workflow object representation of the newly created workflow
-        """
-        dataset = self.client.call(GetDataset(dataset_id))
-        (
-            old_labelset_id,
-            old_model_group_id,
-            old_target_name_map,
-        ) = self._get_teach_task_details(teach_task_id=teach_task_id)
-        # get dataset snapshot
-        export = self.client.call(
-            CreateExport(dataset_id=dataset.id, labelset_id=old_labelset_id, wait=True)
-        )
-        csv = self.client.call(DownloadExport(export.id))
-        print("Obtained snapshot")
-
-        # create workflow
-        workflow = self.structure.create_workflow(
-            name=workflow_name, dataset_id=dataset_id
-        )
-        time.sleep(2)
-        print("Created workflow")
-        old_model_group = self.client.call(
-            GetModelGroup(id=old_model_group_id, wait=True)
-        )
-        model_type = old_model_group.task_type.lower()
-        # Create new teach task
-        workflow = self.structure.add_teach_task(
-            task_name=workflow_name,
-            labelset_name=workflow_name,
-            target_names=list(old_target_name_map.keys()),
-            dataset_id=dataset.id,
-            workflow_id=workflow.id,
-            model_type=model_type,
-            data_column=data_column,
-        )
-        (
-            new_labelset_id,
-            new_model_group_id,
-            new_target_name_map,
-        ) = self._get_teach_task_details(
-            workflow.components[-1].model_group.questionnaire_id
-        )
-        # Get file_to_targets from export CSV
-        file_to_targets = {}
-        for _, row in csv.iterrows():
-            # Check for NaN filled rows
-            if isinstance(row[2], float):
-                continue
-            old_example_id = row[0]
-            old_examples = self._get_example_list(old_model_group_id)
-            targets_list = loads(row[2])["targets"]
-            file_to_targets[old_examples.get_example(old_example_id).data_file_name] = (
-                targets_list
-            )
-        labels = self.get_labels_by_filename(
-            new_model_group_id,
-            file_to_targets,
-            new_target_name_map,
-            rename_labels,
-            remove_labels,
-        )
-        # Label new teach task
-        result = self.structure.label_teach_task(
-            label_set_id=new_labelset_id,
-            labels=[dataclasses.asdict(label) for label in labels],
-            model_group_id=new_model_group_id,
-        )
-        if not result["submitLabelsV2"]["success"]:
-            raise ToolkitPopulationError("Error: Failed to submit labels")
-        return workflow
-
-    def inject_labels_into_teach_task(
-        self,
-        workflow_id: int,
-        teach_task_id: int,
-        file_to_targets: dict,
-        rename_labels: Dict[str, str] = None,
-        remove_labels: List[str] = None,
-    ):
-        """
-        Add label data into an existing teach task
-
-        Args:
-            workflow_id (int): Id of the workflow you wish to add labels to
-            teach_task_id (int): Id of the corresponding teach task to the workflow
-            file_to_targets (dict): mapping of filenames to target label data
-            rename_labels (dict, optional): Dictionary in format
-                {old_label_name : new_label_name}
-            remove_labels (list, optional): List of labels to remove from old teach task
-        """
-        (
-            labelset_id,
-            model_group_id,
-            target_name_map,
-        ) = self._get_teach_task_details(teach_task_id)
-        labels = self.get_labels_by_filename(
-            model_group_id,
-            file_to_targets,
-            target_name_map,
-            rename_labels,
-            remove_labels,
-        )
-        # Label new teach task
-        result = self.structure.label_teach_task(
-            label_set_id=labelset_id,
-            labels=[dataclasses.asdict(label) for label in labels],
-            model_group_id=model_group_id,
-        )
-        if not result["submitLabelsV2"]["success"]:
-            raise ToolkitPopulationError("Error: Failed to submit labels")
-
-    def get_labels_by_filename(
-        self,
-        model_group_id: int,
-        file_to_targets: dict,
-        target_name_map: dict,
-        rename_labels: Dict[str, str] = None,
-        remove_labels: List[str] = None,
-    ) -> List[LabelInput]:
-        """
-        Args:
-            model_group_id (int): ID of the model group to be labeled
-            file_to_targets (dict): mapping in the format {filename : targets_list}
-            target_name_map (dict): mapping of field name to corresponding target ID
-            rename_labels (dict, optional): Dictionary in format
-                {old_label_name : new_label_name}
-            remove_labels (list, optional): List of labels to remove from old teach task
-
-        Returns:
-            A list of LabelInput to be ingested by the platform via submitLabelsV2
-        """
-        labels = []
-        # Retrieve examples and match against filename
-        examples = self._get_example_list(model_group_id)
-
-        for filename, targets_list in file_to_targets.items():
-            if rename_labels or remove_labels:
-                targets_list = self._edit_labels(
-                    targets_list, rename_labels, remove_labels
-                )
-            targets_list = self._convert_label(targets_list, target_name_map)
-            example_id = examples.get_example_id(filename)
-            if example_id:
-                labels.append(LabelInput(example_id, targets_list))
-        return labels
-
-    def _edit_labels(
-        self,
-        targets_list: List[dict],
-        rename_labels: Dict[str, str],
-        remove_labels: List[str],
-    ):
-        new_targets_list = []
-        for target in targets_list:
-            if remove_labels and target["label"] not in remove_labels:
-                if rename_labels and rename_labels.get(target["label"]):
-                    target["label"] = rename_labels[target["label"]]
-                new_targets_list.append(target)
-        return new_targets_list
-
-    def _convert_label(
-        self, targets_list: List[dict], target_name_map: dict
-    ) -> List[LabelInst]:
-        updated_labels = []
-        for target in targets_list:
-            updated_label = LabelInst(target_name_map[target["label"]])
-            if target.get("spans"):
-                updated_spans = [
-                    TokenSpanInput(span["start"], span["end"], span["page_num"])
-                    for span in target["spans"]
-                ]
-                updated_label.spans = updated_spans
-            updated_labels.append(updated_label)
-        return updated_labels
-
-    def _get_teach_task_details(self, teach_task_id: int):
-        teach_task_details = self.structure.get_teach_details(
-            teach_task_id=teach_task_id
-        )
-        labelset_id = teach_task_details["questionnaire"]["question"]["labelset"]["id"]
-        model_group_id = teach_task_details["questionnaire"]["question"]["modelGroupId"]
-        target_names = teach_task_details["questionnaire"]["question"]["labelset"][
-            "targetNames"
-        ]
-        target_name_map = {}
-        for target in target_names:
-            target_name_map[target["name"]] = target["id"]
-        return labelset_id, model_group_id, target_name_map
-
-    def _get_example_list(self, model_group_id: int, limit=1000):
-        examples = self.structure.get_example_ids(
-            model_group_id=model_group_id, limit=limit
-        )
-        examples = ExampleList(
-            examples=[
-                Example(i["id"], i["datafile"]["name"])
-                for i in examples["modelGroup"]["pagedExamples"]["examples"]
-            ]
-        )
-        return examples
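For reference, the directory-to-label scan that `create_auto_classification_workflow` performed is plain `pathlib` logic and can be inlined where still needed. A sketch under the same assumptions as the deleted code; the accepted-extension set is truncated here for illustration:

    from pathlib import Path

    accepted_types = {"pdf", "docx", "png", "tiff"}  # truncated for illustration
    folder = Path("./base_directory/")
    # One subdirectory per class; each file is labeled with its parent folder name.
    files = [
        f for f in folder.glob("*/*")
        if f.is_file() and f.suffix.strip(".").casefold() in accepted_types
    ]
    classes = {f.parent.name for f in files}
    labeled_files = {f.name: [{"label": f.parent.name}] for f in files}
    if len(classes) < 2:
        raise ValueError(
            f"You must have documents in at least 2 directories, you only have {len(classes)}"
        )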
diff --git a/indico_toolkit/auto_populate/types.py b/indico_toolkit/auto_populate/types.py
deleted file mode 100644
index dae2460d..00000000
--- a/indico_toolkit/auto_populate/types.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from dataclasses import dataclass
-from typing import List
-
-
-@dataclass
-class Example:
-    id: int
-    data_file_name: str
-
-
-class ExampleList:
-    def __init__(self, examples: List[Example]):
-        self.examples = examples
-
-    def get_example(self, example_id: int) -> Example:
-        """
-        Returns example with matching example_id. If no matching example id found,
-        return None.
-        """
-        for example in self.examples:
-            if example.id == example_id:
-                return example
-        return None
-
-    def get_example_id(self, example_data_file_name: str) -> int:
-        """
-        Returns id for a specific example with the same name as example_data_file_name.
-        If no matching example found, return None. Assumes no duplicate filenames in
-        dataset
-        """
-        for example in self.examples:
-            if example.data_file_name == example_data_file_name:
-                return example.id
-        return None
-
-
-@dataclass
-class TokenSpanInput:
-    start: int
-    end: int
-    pageNum: int
-
-
-@dataclass
-class SpatialSpanInput:
-    top: int
-    bottom: int
-    left: int
-    right: int
-    pageNum: int
-
-
-@dataclass
-class LabelInst:
-    clsId: int
-    spans: List[TokenSpanInput] = None
-    bounds: List[SpatialSpanInput] = None
-
-
-@dataclass
-class LabelInput:
-    exampleId: int
-    targets: List[LabelInst]
-    rejected: bool = None
-    override: bool = None
-    partial: bool = None
diff --git a/indico_toolkit/indico_wrapper/__init__.py b/indico_toolkit/indico_wrapper/__init__.py
index 40dd45aa..faa39014 100644
--- a/indico_toolkit/indico_wrapper/__init__.py
+++ b/indico_toolkit/indico_wrapper/__init__.py
@@ -1,15 +1,9 @@
-from .dataset import Datasets
-from .doc_extraction import DocExtraction
 from .download import Download
 from .indico_wrapper import IndicoWrapper
-from .reviewer import Reviewer
 from .workflow import Workflow
 
 __all__ = (
-    "Datasets",
-    "DocExtraction",
     "Download",
     "IndicoWrapper",
-    "Reviewer",
     "Workflow",
 )
diff --git a/indico_toolkit/indico_wrapper/dataset.py b/indico_toolkit/indico_wrapper/dataset.py
deleted file mode 100644
index a954d5ab..00000000
--- a/indico_toolkit/indico_wrapper/dataset.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from typing import List
-
-from indico import IndicoClient
-from indico.queries import (
-    AddDataToWorkflow,
-    AddFiles,
-    CreateDataset,
-    CreateEmptyDataset,
-    DeleteDataset,
-    GetDataset,
-)
-from indico.types import Dataset, OcrEngine, Workflow
-
-from .indico_wrapper import IndicoWrapper
-
-
-class Datasets(IndicoWrapper):
-    def __init__(self, client: IndicoClient):
-        self.client = client
-
-    def get_dataset(self, dataset_id: int):
-        return self.client.call(GetDataset(dataset_id))
-
-    def add_files_to_dataset(self, dataset_id: int, filepaths: List[str]) -> Dataset:
-        """
-        Upload documents to an existing dataset and wait for them to OCR
-        """
-        dataset = self.client.call(
-            AddFiles(
-                dataset_id=dataset_id, files=filepaths, autoprocess=True, wait=True
-            )
-        )
-        return dataset
-
-    def add_new_files_to_task(self, workflow_id: int, wait: bool = True) -> Workflow:
-        """
-        Add newly uploaded documents to an existing teach task given the task's
-        associated workflow ID
-
-        Args:
-            workflow_id (int): workflow ID associated with teach task
-            wait (bool, optional): wait for data to be added. Defaults to True.
-        """
-        workflow = self.client.call(AddDataToWorkflow(workflow_id, wait))
-        if wait:
-            print(f"Data added to all teach tasks associated with {workflow.id}")
-        return workflow
-
-    def create_empty_dataset(
-        self,
-        dataset_name: str,
-        dataset_type: str = "DOCUMENT",
-        ocr_engine: OcrEngine = OcrEngine.READAPI,
-    ) -> Dataset:
-        """
-        Create an empty dataset
-        Args:
-            dataset_name (str): Name of the dataset
-            dataset_type (str, optional): TEXT, IMAGE, or DOCUMENT.
-                Defaults to "DOCUMENT".
-        """
-        return self.client.call(
-            CreateEmptyDataset(dataset_name, dataset_type, ocr_engine)
-        )
-
-    def create_dataset(
-        self,
-        filepaths: List[str],
-        dataset_name: str,
-        ocr_engine: OcrEngine = OcrEngine.READAPI,
-    ) -> Dataset:
-        dataset = self.client.call(
-            CreateDataset(
-                name=dataset_name,
-                files=filepaths,
-                ocr_engine=ocr_engine,
-            )
-        )
-        self.dataset_id = dataset.id
-        return dataset
-
-    def delete_dataset(self, dataset_id: int) -> bool:
-        """
-        Returns True if the operation is successful
-        """
-        return self.client.call(DeleteDataset(id=dataset_id))
-
-    def get_dataset_metadata(self, dataset_id: int) -> List[dict]:
-        """
-        Get list of dataset files with information like file name, status, and number of
-        pages
-        """
-        query = """
-            query GetDataset($id: Int) {
-                dataset(id: $id) {
-                    id
-                    name
-                    files {
-                        id
-                        name
-                        numPages
-                        status
-                    }
-                }
-            }
-        """
-        dataset = self.graphQL_request(
-            graphql_query=query, variables={"id": dataset_id}
-        )
-        return dataset["dataset"]["files"]
-
-    def get_col_name_by_id(self, dataset_id: int, col_id: int) -> str:
-        dataset = self.get_dataset(dataset_id)
-        return next(c.name for c in dataset.datacolumns if c.id == col_id)
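Of the removed `Datasets` helpers, only `get_dataset_metadata` carried its own GraphQL document; callers can issue the same query through `indico.queries.GraphQLRequest` (still imported by `conftest.py` below). A sketch assuming an instantiated `client` and a placeholder dataset ID:

    from indico.queries import GraphQLRequest

    GET_DATASET_FILES = """
        query GetDataset($id: Int) {
            dataset(id: $id) {
                id
                name
                files {
                    id
                    name
                    numPages
                    status
                }
            }
        }
    """

    # Same payload the deleted wrapper returned: one dict per dataset file.
    response = client.call(GraphQLRequest(query=GET_DATASET_FILES, variables={"id": 1234}))
    files = response["dataset"]["files"]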
diff --git a/indico_toolkit/indico_wrapper/doc_extraction.py b/indico_toolkit/indico_wrapper/doc_extraction.py
deleted file mode 100644
index 3a10170d..00000000
--- a/indico_toolkit/indico_wrapper/doc_extraction.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from typing import List, Union
-
-from indico import IndicoClient
-from indico.queries import DocumentExtraction, Job
-
-from ..ocr import CustomOcr, OnDoc, StandardOcr
-from .indico_wrapper import IndicoWrapper
-
-
-class DocExtraction(IndicoWrapper):
-    """
-    Class to support DocumentExtraction-related API calls
-    """
-
-    def __init__(
-        self,
-        client: IndicoClient,
-        preset_config: str = "standard",
-        custom_config: dict = None,
-    ):
-        """
-        Args:
-            preset_config (str): Options are simple, legacy, detailed, ondocument, and
-                standard.
-        """
-        self._preset_config = preset_config
-        self.client = client
-        self.json_config = {"preset_config": preset_config}
-        if custom_config:
-            self.json_config = custom_config
-
-    def run_ocr(
-        self, filepaths: List[str], text_setting: str = None
-    ) -> List[Union[StandardOcr, OnDoc, CustomOcr, str]]:
-        """
-        Args:
-            filepaths (List[str]): List of paths to local documents you would like to
-                submit for extraction
-            text_setting (str): Options are full_text and page_texts.
-
-        Returns:
-            extracted_data (List[Union[StandardOcr, OnDoc, CustomOcr, str]]): data from
-                DocumentExtraction converted to OCR objects or string text
-        """
-        jobs = self._submit_to_ocr(filepaths)
-        extracted_data = []
-        for ind, job in enumerate(jobs):
-            status = self.get_job_status(job.id, True)
-            if status.status == "SUCCESS":
-                result = self.get_storage_object(status.result)
-                if text_setting == "full_text":
-                    extracted_data.append(self._convert_ocr_objects(result).full_text)
-                elif text_setting == "page_texts":
-                    extracted_data.append(self._convert_ocr_objects(result).page_texts)
-                else:
-                    extracted_data.append(self._convert_ocr_objects(result))
-            else:
-                raise RuntimeError(
-                    f"{filepaths[ind]} {status.status}: {status.result}."
-                )
-        return extracted_data
-
-    def _submit_to_ocr(self, filepaths: List[str]) -> List[Job]:
-        return self.client.call(
-            DocumentExtraction(files=filepaths, json_config=self.json_config)
-        )
-
-    def _convert_ocr_objects(
-        self, extracted_data: Union[List[dict], dict]
-    ) -> Union[StandardOcr, OnDoc, CustomOcr]:
-        if self.json_config == {"preset_config": "ondocument"}:
-            return OnDoc(extracted_data)
-        elif (
-            self.json_config == {"preset_config": "standard"}
-            or self.json_config is None
-        ):
-            return StandardOcr(extracted_data)
-        else:
-            return CustomOcr(extracted_data)
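`OnDoc` and `StandardOcr` stay in the toolkit; only the submission wrapper above is removed. Wrapping a raw ondocument result (e.g. `ocr_result` from the earlier sketch) still works; the properties shown here are the ones exercised by the integration tests deleted later in this patch:

    from indico_toolkit.ocr import OnDoc

    ondoc = OnDoc(ocr_result)
    print(ondoc.total_pages, ondoc.total_tokens)
    print(ondoc.full_text[:80])
    print(ondoc.ocr_confidence("mean"))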
{"model_name": [{"label"...}]} - """ - self.graphQL_request( - SUBMIT_REVIEW, - { - "rejected": False, - "submissionId": submission_id, - "changes": json.dumps(changes), - }, - ) - - def get_random_review_id(self): - response = self.graphQL_request( - GET_RANDOM_REVIEW_ID, {"workflowId": self.workflow_id} - ) - try: - return response["randomSubmission"]["id"] - except Exception: - raise RuntimeError("The review queue is empty") - - def get_random_exception_id(self): - response = self.graphQL_request( - GET_RANDOM_EXCEPTION_ID, {"workflowId": self.workflow_id} - ) - try: - return response["randomSubmission"]["id"] - except Exception: - raise RuntimeError("The exception queue is empty") - - def reject_submission(self, submission_id): - return self.graphQL_request( - SUBMIT_REVIEW, {"rejected": True, "submissionId": submission_id} - ) - - -SUBMIT_REVIEW = """ -mutation submitStandardQueue( - $changes: JSONString, - $rejected: Boolean, - $submissionId: Int!, - $notes: String -) { - submitReview( - changes: $changes, - rejected: $rejected, - submissionId: $submissionId, - notes: $notes -) { - id - __typename - } -} -""" - -GET_RANDOM_EXCEPTION_ID = """ -query getExceptionsSubmission($workflowId: Int!) { - randomSubmission(adminReview: true, workflowId: $workflowId) { - id - resultFile - inputFilename - autoReview { - id - changes - __typename - } - __typename - } -} -""" - -GET_RANDOM_REVIEW_ID = """ -query getSubmission($workflowId: Int!) { - randomSubmission(adminReview: false, workflowId: $workflowId) { - id - resultFile - inputFilename - autoReview { - id - changes - __typename - } - __typename - } -} -""" diff --git a/indico_toolkit/ocr/__init__.py b/indico_toolkit/ocr/__init__.py index fbf15bf5..7d728eac 100644 --- a/indico_toolkit/ocr/__init__.py +++ b/indico_toolkit/ocr/__init__.py @@ -1,9 +1,7 @@ -from .customocr_object import CustomOcr from .ondoc_object import OnDoc from .standard_object import StandardOcr __all__ = ( - "CustomOcr", "OnDoc", "StandardOcr", ) diff --git a/indico_toolkit/ocr/customocr_object.py b/indico_toolkit/ocr/customocr_object.py deleted file mode 100644 index 78e1c79c..00000000 --- a/indico_toolkit/ocr/customocr_object.py +++ /dev/null @@ -1,42 +0,0 @@ -from typing import List, Union - - -class CustomOcr: - """ - CustomOcr is a helper class for the raw preset config OCR results. Enables easy - extraction of full text and page-level text. 
diff --git a/indico_toolkit/ocr/customocr_object.py b/indico_toolkit/ocr/customocr_object.py
deleted file mode 100644
index 78e1c79c..00000000
--- a/indico_toolkit/ocr/customocr_object.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from typing import List, Union
-
-
-class CustomOcr:
-    """
-    CustomOcr is a helper class for the raw preset config OCR results. Enables easy
-    extraction of full text and page-level text.
-    """
-
-    def __init__(self, customocr: Union[List[dict], dict]):
-        """
-        customocr Union[List[dict], dict]: result object from
-            indico.queries.DocumentExtraction
-        """
-        self.customocr = customocr
-
-    @property
-    def full_text(self) -> str:
-        """
-        Return full document text as a string
-        """
-        if isinstance(self.customocr, dict) and "text" in self.customocr:
-            return self.customocr["text"]
-        elif isinstance(self.customocr, dict) and "pages" in self.customocr:
-            if "text" in self.customocr["pages"][0]:
-                return "\n".join(page["text"] for page in self.customocr["pages"])
-        elif isinstance(self.customocr, list) and "pages" in self.customocr[0]:
-            if "text" in self.customocr[0]["pages"][0]:
-                return "\n".join(page["pages"][0]["text"] for page in self.customocr)
-        raise RuntimeError("JSON configuration setting does not have full text.")
-
-    @property
-    def page_texts(self) -> List[str]:
-        """
-        Return list of page-level text
-        """
-        if isinstance(self.customocr, dict) and "pages" in self.customocr:
-            return [page["text"] for page in self.customocr["pages"]]
-        elif isinstance(self.customocr, list) and "pages" in self.customocr[0]:
-            if "text" in self.customocr[0]["pages"][0]:
-                return [page["pages"][0]["text"] for page in self.customocr]
-        raise RuntimeError("JSON configuration setting does not have page-level text.")
diff --git a/mypy.ini b/mypy.ini
index a205789e..fd98fdd6 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -18,9 +18,6 @@ ignore_missing_imports = True
 [mypy-indico_toolkit.association.*]
 ignore_errors = True
 
-[mypy-indico_toolkit.auto_populate.*]
-ignore_errors = True
-
 [mypy-indico_toolkit.auto_review.*]
 ignore_errors = True
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index b0c3d356..ec91e109 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -6,14 +6,11 @@
     AddModelGroupComponent,
     CreateDataset,
     CreateWorkflow,
-    DocumentExtraction,
     GetTrainingModelWithProgress,
     GraphQLRequest,
-    JobStatus,
-    RetrieveStorageObject,
 )
 
-from indico_toolkit.indico_wrapper import DocExtraction, Workflow
+from indico_toolkit.indico_wrapper import Workflow
 
 
 def pytest_addoption(parser: pytest.Parser) -> None:
@@ -62,11 +59,6 @@ def dataset_id(dataset):
     return dataset.id
 
 
-@pytest.fixture(scope="session")
-def doc_extraction_standard(indico_client):
-    return DocExtraction(indico_client)
-
-
 @pytest.fixture(scope="session")
 def extraction_model_group_id(workflow):
     return workflow.components[-1].model_group.id
@@ -102,18 +94,6 @@ def module_submission_ids(workflow_id, indico_client, pdf_file):
     return sub_ids
 
 
-@pytest.fixture(scope="session")
-def ondoc_ocr_object(indico_client, pdf_file):
-    job = indico_client.call(
-        DocumentExtraction(
-            files=[pdf_file], json_config={"preset_config": "ondocument"}
-        )
-    )
-    job = indico_client.call(JobStatus(id=job[0].id, wait=True))
-    extracted_data = indico_client.call(RetrieveStorageObject(job.result))
-    return extracted_data
-
-
 @pytest.fixture(scope="session")
 def pdf_file(tests_folder: Path) -> Path:
     return tests_folder / "data/samples/fin_disc.pdf"
@@ -124,17 +104,6 @@ def populator_snapshot_file(tests_folder: Path) -> Path:
     return tests_folder / "data/snapshots/populator_snapshot.csv"
 
 
-@pytest.fixture(scope="session")
-def standard_ocr_object(indico_client, pdf_file):
-    # TODO: this can be static-- probably should be "ondoc" as well
-    job = indico_client.call(
-        DocumentExtraction(files=[pdf_file], json_config={"preset_config": "standard"})
-    )
-    job = indico_client.call(JobStatus(id=job[0].id, wait=True))
-    extracted_data = indico_client.call(RetrieveStorageObject(job.result))
-    return extracted_data
-
-
 @pytest.fixture(scope="session")
 def teach_task_id(workflow):
     return workflow.components[-1].model_group.questionnaire_id
diff --git a/tests/integration/indico_wrapper/test_dataset.py b/tests/integration/indico_wrapper/test_dataset.py
deleted file mode 100644
index 2b785071..00000000
--- a/tests/integration/indico_wrapper/test_dataset.py
+++ /dev/null
@@ -1,40 +0,0 @@
-"""
-Test Datasets class methods
-"""
-
-import pytest
-from indico.types import Dataset
-
-from indico_toolkit.indico_wrapper import Datasets
-
-
-@pytest.fixture(scope="module")
-def dataset_wrapper(indico_client):
-    return Datasets(indico_client)
-
-
-def test_get_dataset(dataset_wrapper, dataset_id):
-    dataset = dataset_wrapper.get_dataset(dataset_id)
-    assert isinstance(dataset, Dataset)
-
-
-def test_add_to_dataset(dataset_wrapper, dataset_id, pdf_file):
-    dataset = dataset_wrapper.add_files_to_dataset(dataset_id, filepaths=[pdf_file])
-    assert isinstance(dataset, Dataset)
-    for f in dataset.files:
-        assert f.status in ["PROCESSED", "FAILED"]
-
-
-def test_get_dataset_files(dataset_wrapper, dataset_id):
-    files_list = dataset_wrapper.get_dataset_metadata(dataset_id)
-    assert isinstance(files_list, list)
-    assert len(files_list) > 0
-
-
-def test_create_delete_dataset(dataset_wrapper, pdf_file):
-    dataset = dataset_wrapper.create_dataset(
-        filepaths=[pdf_file], dataset_name="Toolkit Integration Tests"
-    )
-    assert isinstance(dataset, Dataset)
-    status = dataset_wrapper.delete_dataset(dataset.id)
-    assert status
diff --git a/tests/integration/indico_wrapper/test_doc_extraction.py b/tests/integration/indico_wrapper/test_doc_extraction.py
deleted file mode 100644
index 7917f68d..00000000
--- a/tests/integration/indico_wrapper/test_doc_extraction.py
+++ /dev/null
@@ -1,127 +0,0 @@
-from indico_toolkit.indico_wrapper import DocExtraction
-from indico_toolkit.ocr import OnDoc, StandardOcr
-
-
-def test_run_ocr_ondoc(indico_client, pdf_file):
-    doc_extraction_ondoc = DocExtraction(indico_client, preset_config="ondocument")
-    extracted_data = doc_extraction_ondoc.run_ocr(filepaths=[pdf_file])
-    for item in extracted_data:
-        assert isinstance(item, OnDoc)
-
-
-def test_run_ocr_standard(doc_extraction_standard, pdf_file):
-    extracted_data = doc_extraction_standard.run_ocr(filepaths=[pdf_file])
-    for item in extracted_data:
-        assert isinstance(item, StandardOcr)
-
-
-def test_run_ocr_standard_full_text(doc_extraction_standard, pdf_file):
-    full_text_result = doc_extraction_standard.run_ocr(
-        filepaths=[pdf_file], text_setting="full_text"
-    )
-    assert len(full_text_result[0]) == 2062
-
-
-def test_run_ocr_standard_page_texts(doc_extraction_standard, pdf_file):
-    page_texts_result = doc_extraction_standard.run_ocr(
-        filepaths=[pdf_file], text_setting="page_texts"
-    )
-    assert len(page_texts_result[0][0]) == 1153
-
-
-def test_run_ocr_custom_full_text(indico_client, pdf_file):
-    doc_extraction_custom = DocExtraction(
-        indico_client,
-        custom_config={
-            "top_level": "page",
-            "nest": False,
-            "reblocking": ["style", "list", "inline-header"],
-            "pages": [
-                "text",
-                "size",
-                "dpi",
-                "doc_offset",
-                "page_num",
-                "image",
-                "thumbnail",
-            ],
-            "blocks": [
-                "text",
-                "doc_offset",
-                "page_offset",
-                "position",
-                "block_type",
-                "page_num",
-            ],
-            "tokens": [
-                "text",
-                "doc_offset",
-                "page_offset",
-                "block_offset",
-                "position",
-                "page_num",
-                "style",
-            ],
-            "chars": [
-                "text",
-                "doc_index",
-                "block_index",
-                "page_index",
-                "page_num",
-                "position",
-            ],
-        },
-    )
-    full_text_result = doc_extraction_custom.run_ocr(
-        filepaths=[pdf_file], text_setting="full_text"
-    )
-    assert len(full_text_result[0]) == 2067
-
-
-def test_run_ocr_custom_page_texts(indico_client, pdf_file):
-    doc_extraction_custom = DocExtraction(
-        indico_client,
-        custom_config={
-            "top_level": "page",
-            "nest": False,
-            "reblocking": ["style", "list", "inline-header"],
-            "pages": [
-                "text",
-                "size",
-                "dpi",
-                "doc_offset",
-                "page_num",
-                "image",
-                "thumbnail",
-            ],
-            "blocks": [
-                "text",
-                "doc_offset",
-                "page_offset",
-                "position",
-                "block_type",
-                "page_num",
-            ],
-            "tokens": [
-                "text",
-                "doc_offset",
-                "page_offset",
-                "block_offset",
-                "position",
-                "page_num",
-                "style",
-            ],
-            "chars": [
-                "text",
-                "doc_index",
-                "block_index",
-                "page_index",
-                "page_num",
-                "position",
-            ],
-        },
-    )
-    page_texts_result = doc_extraction_custom.run_ocr(
-        filepaths=[pdf_file], text_setting="page_texts"
-    )
-    assert len(page_texts_result[0][0]) == 1158
diff --git a/tests/integration/indico_wrapper/test_reviewer.py b/tests/integration/indico_wrapper/test_reviewer.py
deleted file mode 100644
index 9b26a4ce..00000000
--- a/tests/integration/indico_wrapper/test_reviewer.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import pytest
-
-from indico_toolkit.indico_wrapper import Reviewer, Workflow
-
-
-@pytest.fixture(scope="module")
-def submissions_awaiting_review(workflow_id, indico_client, pdf_file):
-    """
-    Ensure that auto review is turned off and there are two submissions "PENDING_REVIEW"
-    """
-    workflow_wrapper = Workflow(indico_client)
-    workflow_wrapper.update_workflow_settings(
-        workflow_id, enable_review=True, enable_auto_review=False
-    )
-    sub_ids = workflow_wrapper.submit_documents_to_workflow(
-        workflow_id, files=[pdf_file, pdf_file]
-    )
-    workflow_wrapper.wait_for_submissions_to_process(sub_ids)
-
-
-def get_change_formatted_predictions(workflow_result):
-    """
-    Helper function to get the change format for accepted predictions in test_accept_review
-    """
-    return {workflow_result.model_name: workflow_result.get_predictions.to_list()}
-
-
-@pytest.mark.skip(reason="broken on indico-client>=6.1.0")
-def test_accept_review(submissions_awaiting_review, indico_client, workflow_id):
-    reviewer_wrapper = Reviewer(indico_client, workflow_id)
-    id_in_review = reviewer_wrapper.get_random_review_id()
-    submission = reviewer_wrapper.get_submission_object(id_in_review)
-    assert submission.status == "PENDING_REVIEW"
-    predictions = reviewer_wrapper.get_submission_results_from_ids([id_in_review])
-    changes = get_change_formatted_predictions(predictions[0])
-    reviewer_wrapper.accept_review(id_in_review, changes)
-    submission = reviewer_wrapper.get_submission_object(id_in_review)
-    assert submission.status == "COMPLETE"
-
-
-@pytest.mark.skip(reason="flaky, depends on submission processing time")
-def test_reject_from_review(submissions_awaiting_review, indico_client, workflow_id):
-    reviewer_wrapper = Reviewer(indico_client, workflow_id)
-    id_in_review = reviewer_wrapper.get_random_review_id()
-    reviewer_wrapper.reject_submission(id_in_review)
-    submission = reviewer_wrapper.get_submission_object(id_in_review)
-    assert submission.status == "PENDING_ADMIN_REVIEW"
-
-
-@pytest.mark.skip(reason="flaky, depends on submission processing time")
-def test_reject_from_admin_review(
-    submissions_awaiting_review, indico_client, workflow_id
-):
-    reviewer_wrapper = Reviewer(indico_client, workflow_id)
-    id_in_exception = reviewer_wrapper.get_random_exception_id()
-    submission = reviewer_wrapper.get_submission_object(id_in_exception)
-    assert submission.status == "PENDING_ADMIN_REVIEW"
-    reviewer_wrapper.reject_submission(id_in_exception)
-    submission = reviewer_wrapper.get_submission_object(id_in_exception)
-    assert submission.status == "COMPLETE"
diff --git a/tests/integration/ocr/__init__.py b/tests/integration/ocr/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/integration/ocr/test_customocr_object.py b/tests/integration/ocr/test_customocr_object.py
deleted file mode 100644
index 078ca7f2..00000000
--- a/tests/integration/ocr/test_customocr_object.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import pytest
-
-from indico_toolkit.indico_wrapper import DocExtraction
-
-
-def test_full_text(indico_client, pdf_file):
-    doc_extraction = DocExtraction(indico_client, preset_config="simple")
-    custom_ocr = doc_extraction.run_ocr(filepaths=[pdf_file])
-    assert len(custom_ocr[0].full_text) == 2823
-
-
-def test_full_text_exception(indico_client, pdf_file):
-    doc_extraction = DocExtraction(
-        indico_client,
-        custom_config={
-            "nest": True,
-            "top_level": "document",
-            "native_pdf": True,
-            "blocks": ["text", "position", "doc_offset", "page_offset"],
-        },
-    )
-    custom_ocr = doc_extraction.run_ocr(filepaths=[pdf_file])
-    with pytest.raises(Exception):
-        custom_ocr[0].full_text
-
-
-def test_page_texts(indico_client, pdf_file):
-    doc_extraction = DocExtraction(
-        indico_client,
-        custom_config={
-            "nest": True,
-            "top_level": "document",
-            "native_pdf": True,
-            "pages": ["text", "size", "dpi", "doc_offset", "page_num", "image"],
-            "blocks": ["text", "position", "doc_offset", "page_offset"],
-        },
-    )
-    custom_ocr = doc_extraction.run_ocr(filepaths=[pdf_file])
-    assert isinstance(custom_ocr[0].page_texts, list)
-    assert isinstance(custom_ocr[0].page_texts[0], str)
-
-
-def test_page_texts_exception(indico_client, pdf_file):
-    doc_extraction = DocExtraction(indico_client, preset_config="legacy")
-    custom_ocr = doc_extraction.run_ocr(filepaths=[pdf_file])
-    with pytest.raises(Exception):
-        custom_ocr.page_texts
diff --git a/tests/integration/ocr/test_ondoc_object.py b/tests/integration/ocr/test_ondoc_object.py
deleted file mode 100644
index e5f49c6b..00000000
--- a/tests/integration/ocr/test_ondoc_object.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import pytest
-
-from indico_toolkit.ocr import OnDoc
-
-
-def test_ondoc_full_text(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert len(ondoc_ocr.full_text) == 2067
-
-
-def test_ondoc_page_texts(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert len(ondoc_ocr.page_texts) == 2
-    assert len(ondoc_ocr.page_texts[0]) == 1158
-
-
-def test_ondoc_page_results(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert len(ondoc_ocr.page_results) == 2
-    assert len(ondoc_ocr.page_results[0]) == 8
-
-
-def test_ondoc_block_texts(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert len(ondoc_ocr.block_texts) == 41
-
-
-def test_ondoc_token_objects(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert len(ondoc_ocr.token_objects) == 304
-
-
-def test_ondoc_total_pages(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert ondoc_ocr.total_pages == 2
-
-
-def test_ondoc_total_characters(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert ondoc_ocr.total_characters == 2067
-
-
-def test_ondoc_total_tokens(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert ondoc_ocr.total_tokens == 304
-
-
-def test_ondoc_confidence(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert isinstance(ondoc_ocr.ocr_confidence("mean"), float)
-    assert 1 <= ondoc_ocr.ocr_confidence("mean") <= 100
-
-
-def test_ondoc_confidence_metric_exception(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    with pytest.raises(Exception):
-        ondoc_ocr.ocr_confidence("average")
-
-
-def test_ondoc_excluded_confidence_exception(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object[0]["chars"][0].pop("confidence"))
-    with pytest.raises(Exception):
-        ondoc_ocr.ocr_confidence("mean")
diff --git a/tests/integration/ocr/test_standard_object.py b/tests/integration/ocr/test_standard_object.py
deleted file mode 100644
index 4753b8da..00000000
--- a/tests/integration/ocr/test_standard_object.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from indico_toolkit.ocr import StandardOcr
-
-
-def test_standard_object_full_text(standard_ocr_object):
-    standard_ocr = StandardOcr(standard_ocr_object)
-    assert len(standard_ocr.full_text) == 2062
-
-
-def test_standard_object_page_texts(standard_ocr_object):
-    standard_ocr = StandardOcr(standard_ocr_object)
-    assert len(standard_ocr.page_texts) == 2
-    assert len(standard_ocr.page_texts[0]) == 1153
-
-
-def test_standard_object_page_results(standard_ocr_object):
-    standard_ocr = StandardOcr(standard_ocr_object)
-    assert len(standard_ocr.page_results) == 2
-    assert len(standard_ocr.page_results[0]) == 4
-
-
-def test_standard_object_block_texts(standard_ocr_object):
-    standard_ocr = StandardOcr(standard_ocr_object)
-    assert len(standard_ocr.block_texts) == 36
-
-
-def test_standard_object_total_pages(standard_ocr_object):
-    standard_ocr = StandardOcr(standard_ocr_object)
-    assert standard_ocr.total_pages == 2
-
-
-def test_standard_object_total_characters(standard_ocr_object):
-    standard_ocr = StandardOcr(standard_ocr_object)
-    assert standard_ocr.total_characters == 2062
diff --git a/tests/integration/test_populator.py b/tests/integration/test_populator.py
deleted file mode 100644
index bd2981fd..00000000
--- a/tests/integration/test_populator.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import json
-import os
-
-import pytest
-from indico.queries import GetWorkflow
-from indico.types import Workflow
-
-from indico_toolkit.auto_populate import AutoPopulator
-from indico_toolkit.auto_populate.types import LabelInput, LabelInst
-
-pd = pytest.importorskip("pandas")
-
-
-@pytest.fixture(scope="function")
-def static_file_to_targets(populator_snapshot_file):
-    df = pd.read_csv(populator_snapshot_file)
-    file_to_targets = {}
-    for file, target in zip(
-        df["file_name_1820"].to_list(), df["Toolkit Test Financial Model"].to_list()
-    ):
-        if not isinstance(target, float):
-            file_to_targets[file] = json.loads(target)["targets"]
-    return file_to_targets
-
-
-def test_create_classification_workflow(indico_client, tests_folder):
-    auto_populator = AutoPopulator(indico_client)
-    new_workflow = auto_populator.create_auto_classification_workflow(
-        os.path.join(tests_folder, "data/auto_class"),
-        "My dataset",
-        "My workflow",
-        "My teach task",
-    )
-    assert isinstance(new_workflow, Workflow)
-
-
-def test_create_classification_workflow_too_few_classes(indico_client, tests_folder):
-    auto_populator = AutoPopulator(indico_client)
-    with pytest.raises(Exception):
-        auto_populator.create_auto_classification_workflow(
-            os.path.join(tests_folder, "data/auto_class/class_a/"),
-            "My dataset",
-            "My workflow",
-            "My teach task",
-        )
-
-
-def test_copy_teach_task(indico_client, dataset, workflow_id, teach_task_id):
-    auto_populator = AutoPopulator(indico_client)
-    original_workflow = indico_client.call(GetWorkflow(workflow_id))
-    new_workflow = auto_populator.copy_teach_task(
-        dataset_id=dataset.id,
-        teach_task_id=teach_task_id,
-        workflow_name=f"{original_workflow.name}_Copied",
-        data_column="text",
-    )
-    assert isinstance(new_workflow, Workflow)
-
-
-def test_get_labels_by_filename(
-    indico_client,
-    extraction_model_group_id,
-    teach_task_id,
-    static_file_to_targets,
-):
-    populator = AutoPopulator(indico_client)
-    (
-        labelset_id,
-        model_group_id,
-        target_name_map,
-    ) = populator._get_teach_task_details(teach_task_id)
-
-    labels = populator.get_labels_by_filename(
-        extraction_model_group_id, static_file_to_targets, target_name_map
-    )
-    assert len(labels) != 0
-    for label in labels:
-        assert isinstance(label, LabelInput)
-        for target in label.targets:
-            assert isinstance(target, LabelInst)
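Downstream code that relied on `AutoPopulator.copy_teach_task` only for snapshot retrieval can still export labels with the two queries the populator used. A sketch assuming an instantiated `client` and placeholder dataset/labelset IDs:

    from indico.queries import CreateExport, DownloadExport

    export = client.call(CreateExport(dataset_id=1234, labelset_id=5678, wait=True))
    df = client.call(DownloadExport(export.id))  # pandas DataFrame, one row per example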