From c9e8ee488a7b05b8e1139253a52d82379885ac5e Mon Sep 17 00:00:00 2001
From: Scott
Date: Mon, 31 Mar 2025 14:33:33 -0400
Subject: [PATCH] remove doc extraction, review, and dataset classes

---
 examples/copy_teach_task.py                   |  20 -
 .../create_auto_classification_workflow.py    |  27 --
 examples/dataset_tasks.py                     |  25 --
 examples/merge_snapshots.py                   |  20 -
 examples/pdf_highlighter.py                   |  35 --
 examples/submitting_to_doc_extraction.py      |  24 --
 indico_toolkit/auto_populate/__init__.py      |   3 -
 indico_toolkit/auto_populate/populator.py     | 362 ------------------
 indico_toolkit/auto_populate/types.py         |  66 ----
 indico_toolkit/indico_wrapper/__init__.py     |   6 -
 indico_toolkit/indico_wrapper/dataset.py      | 114 ------
 .../indico_wrapper/doc_extraction.py          |  79 ----
 indico_toolkit/indico_wrapper/reviewer.py     | 111 ------
 indico_toolkit/ocr/__init__.py                |   2 -
 indico_toolkit/ocr/customocr_object.py        |  42 --
 mypy.ini                                      |   3 -
 tests/integration/conftest.py                 |  33 +-
 .../indico_wrapper/test_dataset.py            |  40 --
 .../indico_wrapper/test_doc_extraction.py     | 127 ------
 .../indico_wrapper/test_reviewer.py           |  60 ---
 tests/integration/ocr/__init__.py             |   0
 .../integration/ocr/test_customocr_object.py  |  47 ---
 tests/integration/ocr/test_ondoc_object.py    |  63 ---
 tests/integration/ocr/test_standard_object.py |  33 --
 tests/integration/test_populator.py           |  80 ----
 25 files changed, 1 insertion(+), 1421 deletions(-)
 delete mode 100644 examples/copy_teach_task.py
 delete mode 100644 examples/create_auto_classification_workflow.py
 delete mode 100644 examples/dataset_tasks.py
 delete mode 100644 examples/pdf_highlighter.py
 delete mode 100644 examples/submitting_to_doc_extraction.py
 delete mode 100644 indico_toolkit/auto_populate/__init__.py
 delete mode 100644 indico_toolkit/auto_populate/populator.py
 delete mode 100644 indico_toolkit/auto_populate/types.py
 delete mode 100644 indico_toolkit/indico_wrapper/dataset.py
 delete mode 100644 indico_toolkit/indico_wrapper/doc_extraction.py
 delete mode 100644 indico_toolkit/indico_wrapper/reviewer.py
 delete mode 100644 indico_toolkit/ocr/customocr_object.py
 delete mode 100644 tests/integration/indico_wrapper/test_dataset.py
 delete mode 100644 tests/integration/indico_wrapper/test_doc_extraction.py
 delete mode 100644 tests/integration/indico_wrapper/test_reviewer.py
 delete mode 100644 tests/integration/ocr/__init__.py
 delete mode 100644 tests/integration/ocr/test_customocr_object.py
 delete mode 100644 tests/integration/ocr/test_ondoc_object.py
 delete mode 100644 tests/integration/ocr/test_standard_object.py
 delete mode 100644 tests/integration/test_populator.py

diff --git a/examples/copy_teach_task.py b/examples/copy_teach_task.py
deleted file mode 100644
index d0b7f459..00000000
--- a/examples/copy_teach_task.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from indico_toolkit import create_client
-from indico_toolkit.auto_populate import AutoPopulator
-
-"""
-Create a new copied Workflow based on a given Teach Task Id
-and corresponding Dataset Id.
-"""
-
-HOST = "app.indico.io"
-API_TOKEN_PATH = "./indico_api_token.txt"
-DATASET_ID = 0
-TEACH_TASK_ID = 0
-
-client = create_client(HOST, API_TOKEN_PATH)
-auto_populator = AutoPopulator(client)
-new_workflow = auto_populator.copy_teach_task(
-    dataset_id=DATASET_ID,
-    teach_task_id=TEACH_TASK_ID,
-    workflow_name="Copied Workflow",
-)
diff --git a/examples/create_auto_classification_workflow.py b/examples/create_auto_classification_workflow.py
deleted file mode 100644
index 355adeb4..00000000
--- a/examples/create_auto_classification_workflow.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from indico_toolkit import create_client
-from indico_toolkit.auto_populate import AutoPopulator
-
-"""
-Create an Indico Classification Workflow without any labeling using an organized
-directory/folder structure. Each folder/directory should contain only one file type.
-
-For example, you would target '/base_directory/' if you had your files organized like:
-
-/base_directory/
-/base_directory/invoices/ -> contains only invoice files
-/base_directory/disclosures/ -> contains only disclosure files
-"""
-
-HOST = "app.indico.io"
-API_TOKEN_PATH = "./indico_api_token.txt"
-
-DIRECTORY_FILE_PATH = "./base_directory/"
-
-client = create_client(HOST, API_TOKEN_PATH)
-auto_populator = AutoPopulator(client)
-new_workflow = auto_populator.create_auto_classification_workflow(
-    DIRECTORY_FILE_PATH,
-    "My dataset",
-    "My workflow",
-    "My teach task",
-)
diff --git a/examples/dataset_tasks.py b/examples/dataset_tasks.py
deleted file mode 100644
index a53a2196..00000000
--- a/examples/dataset_tasks.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from indico_toolkit import create_client
-from indico_toolkit.indico_wrapper import Datasets, Download
-from indico_toolkit.pipelines import FileProcessing
-
-DATASET_ID = 1234
-HOST = "app.indico.io"
-API_TOKEN_PATH = "./indico_api_token.txt"
-
-# Instantiate the datasets class
-client = create_client(HOST, API_TOKEN_PATH)
-datasets = Datasets(client, DATASET_ID)
-downloader = Download(client)
-"""
-Example 1:
-
-Upload files to an existing dataset in batches
-"""
-# Collect files to upload
-fp = FileProcessing()
-fp.get_file_paths_from_dir("./datasets/disclosures/")
-
-# Upload files to dataset in batches
-for paths in fp.batch_files(batch_size=2):
-    datasets.add_files_to_dataset(paths)
-    print(f"Uploaded {len(paths)} files")
-""" -client = create_client(HOST, API_TOKEN_PATH) -dataset = Datasets(client) -uploaded_dataset = dataset.create_dataset([OUTPUT_PATH], dataset_name="my_dataset") -print(f"My Dataset ID is {uploaded_dataset.id}") -model = dataset.train_model( - uploaded_dataset, - model_name="my_model", - source_col=main_snap.text_col, - target_col=main_snap.label_col, - wait=False, -) -print(f"My Model Group ID is {model.id}") diff --git a/examples/pdf_highlighter.py b/examples/pdf_highlighter.py deleted file mode 100644 index 3540205b..00000000 --- a/examples/pdf_highlighter.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Highlight Indico Extraction Predictions on the source PDF -""" - -from indico_toolkit import create_client -from indico_toolkit.highlighter import Highlighter -from indico_toolkit.indico_wrapper import Workflow - -WORKFLOW_ID = 1418 -HOST = "app.indico.io" -API_TOKEN_PATH = "./indico_api_token.txt" -PATH_TO_DOCUMENT = "./mydocument.pdf" -# Instantiate the workflow class -client = create_client(HOST, API_TOKEN_PATH) -wflow = Workflow(client) - -# Get predictions and ondocument OCR object -submission_ids = wflow.submit_documents_to_workflow(WORKFLOW_ID, [PATH_TO_DOCUMENT]) -submission_result = wflow.get_submission_results_from_ids(submission_ids)[0] -ocr_object = wflow.get_ondoc_ocr_from_etl_url(submission_result.etl_url) - -# Highlight Predictions onto source document and write it to disc -highlighter = Highlighter(submission_result.predictions, PATH_TO_DOCUMENT) -highlighter.collect_tokens(ocr_object.token_objects) -highlighter.highlight_pdf("./highlighted_doc.pdf", ocr_object.page_heights_and_widths) - -# You can also have unique color highlights for each label group, write the label above -# the highlight, and add bookmarks of what labels appear on which pages -highlighter.highlight_pdf( - "./highlighted_doc.pdf", - ocr_object.page_heights_and_widths, - all_yellow_highlight=False, - add_label_annotations=True, - add_bookmarks=True, -) diff --git a/examples/submitting_to_doc_extraction.py b/examples/submitting_to_doc_extraction.py deleted file mode 100644 index 2bc20cc2..00000000 --- a/examples/submitting_to_doc_extraction.py +++ /dev/null @@ -1,24 +0,0 @@ -from indico_toolkit import create_client -from indico_toolkit.indico_wrapper import DocExtraction -from indico_toolkit.pipelines import FileProcessing - -""" -Retrieves a list of raw full document texts for all files in a folder -""" - -HOST = "app.indico.io" -API_TOKEN_PATH = "./indico_api_token.txt" - -# Instantiate the doc_extraction class -client = create_client(HOST, API_TOKEN_PATH) -doc_extraction = DocExtraction(client=client, preset_config="ondocument") - -# Collect files to submit -fp = FileProcessing() -fp.get_file_paths_from_dir("./datasets/disclosures/") - -# Submit documents with optional text setting and save results to variable -doc_texts = [] -for paths in fp.batch_files(batch_size=10): - doc_texts.append(doc_extraction.run_ocr(filepaths=paths, text_setting="full_text")) -print(doc_texts) diff --git a/indico_toolkit/auto_populate/__init__.py b/indico_toolkit/auto_populate/__init__.py deleted file mode 100644 index c3330c10..00000000 --- a/indico_toolkit/auto_populate/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .populator import AutoPopulator - -__all__ = ("AutoPopulator",) diff --git a/indico_toolkit/auto_populate/populator.py b/indico_toolkit/auto_populate/populator.py deleted file mode 100644 index 9b2e6396..00000000 --- a/indico_toolkit/auto_populate/populator.py +++ /dev/null @@ -1,362 +0,0 @@ -import dataclasses 
diff --git a/indico_toolkit/auto_populate/__init__.py b/indico_toolkit/auto_populate/__init__.py
deleted file mode 100644
index c3330c10..00000000
--- a/indico_toolkit/auto_populate/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .populator import AutoPopulator
-
-__all__ = ("AutoPopulator",)
diff --git a/indico_toolkit/auto_populate/populator.py b/indico_toolkit/auto_populate/populator.py
deleted file mode 100644
index 9b2e6396..00000000
--- a/indico_toolkit/auto_populate/populator.py
+++ /dev/null
@@ -1,362 +0,0 @@
-import dataclasses
-import time
-from json import loads
-from os import PathLike
-from pathlib import Path
-from typing import Dict, List, Tuple, Union
-
-from indico import IndicoClient
-from indico.queries import (
-    CreateExport,
-    DownloadExport,
-    GetDataset,
-    GetModelGroup,
-)
-from indico.types import Workflow
-
-from ..errors import ToolkitPopulationError
-from ..structure.create_structure import Structure
-from .types import (
-    Example,
-    ExampleList,
-    LabelInput,
-    LabelInst,
-    TokenSpanInput,
-)
-
-
-class AutoPopulator:
-    def __init__(self, client: IndicoClient):
-        """
-        Module for manipulating and creating new workflows and teach tasks.
-
-        Args:
-            client (IndicoClient): instantiated Indico Client
-        """
-        self.client = client
-        self.structure = Structure(client)
-        self._exceptions = []
-
-    def create_auto_classification_workflow(
-        self,
-        directory_path: Union[str, PathLike[str]],
-        dataset_name: str,
-        workflow_name: str,
-        teach_task_name: str,
-        accepted_types: Tuple[str, ...] = (
-            "csv",
-            "doc",
-            "docx",
-            "eml",
-            "jpeg",
-            "jpg",
-            "msg",
-            "pdf",
-            "png",
-            "pptx",
-            "rtf",
-            "svg",
-            "tif",
-            "tiff",
-            "txt",
-            "xls",
-            "xlsx",
-        ),
-    ) -> Workflow:
-        """
-        Label and train a model based on a directory structure or existing teach task.
-        You should have a base directory containing subdirectories where each
-        directory contains a unique file type and only that file type.
-
-        Example:
-            base_directory/
-            base_directory/invoices/ -> folder containing only invoices
-            base_directory/disclosures/ -> folder containing only disclosures
-            etc. etc.
-        Args:
-            directory_path (str): Path to a directory containing your filepath structure
-            dataset_name (str): Name of created dataset
-            workflow_name (str): Name of created workflow
-            teach_task_name (str): Name of created teach task
-            accepted_types (Tuple[str], optional): List of accepted file types to search
-        Returns:
-            Workflow: a Workflow object representation of the newly created workflow
-        """
-
-        def valid_file(file: Path) -> bool:
-            return (
-                file.is_file() and file.suffix.strip(".").casefold() in accepted_types
-            )
-
-        folder = Path(directory_path)
-        files = list(filter(valid_file, folder.glob("*/*")))
-        classes = list(set(file.parent.name for file in files))
-        labeled_files = {file.name: [{"label": file.parent.name}] for file in files}
-
-        if len(classes) < 2:
-            raise ToolkitPopulationError(
-                "You must have documents in at least 2 directories, "
-                f"you only have {len(classes)}"
-            )
-
-        # Upload files to a new dataset.
-        dataset = self.structure.create_dataset(
-            dataset_name=dataset_name,
-            files_to_upload=files,
-            read_api=True,
-            single_column=False,
-            auto_rotate=False,
-            upscale_images=True,
-            languages=["ENG"],
-        )
-
-        # Create a new workflow with a classification model.
-        workflow = self.structure.create_workflow(workflow_name, dataset.id)
-        workflow = self.structure.add_teach_task(
-            task_name=teach_task_name,
-            labelset_name=f"{teach_task_name}_labelset",
-            target_names=classes,
-            dataset_id=dataset.id,
-            workflow_id=workflow.id,
-            model_type="classification",
-        )
-        teach_task_id = workflow.components[-1].model_group.questionnaire_id
-        labelset_id, model_group_id, label_map = self._get_teach_task_details(
-            teach_task_id
-        )
-
-        labels = self.get_labels_by_filename(model_group_id, labeled_files, label_map)
-        self.structure.label_teach_task(
-            label_set_id=labelset_id,
-            labels=list(map(dataclasses.asdict, labels)),
-            model_group_id=model_group_id,
-        )
-
-        return workflow
-
-    def copy_teach_task(
-        self,
-        dataset_id: int,
-        teach_task_id: int,
-        workflow_name: str,
-        data_column: str = "document",
-        rename_labels: Dict[str, str] = None,
-        remove_labels: List[str] = None,
-    ) -> Workflow:
-        """
-        Create a duplicate teach task on the same Indico platform.
-
-        Note: Does not work with datasets created with a snapshot
-
-        Args:
-            dataset_id (int): The dataset id of the dataset you wish to copy
-            teach_task_id (int): The teach task id of the corresponding teach task to
-                the dataset
-            workflow_name (string): The name of the newly created workflow
-            data_column (str, optional): The data column of the corresponding
-                dataset. Defaults to 'document'
-            rename_labels (dict, optional): Dictionary in format
-                {old_label_name : new_label_name}
-            remove_labels (list, optional): List of labels to remove from old teach task
-
-        Returns:
-            Workflow: a Workflow object representation of the newly created workflow
-        """
-        dataset = self.client.call(GetDataset(dataset_id))
-        (
-            old_labelset_id,
-            old_model_group_id,
-            old_target_name_map,
-        ) = self._get_teach_task_details(teach_task_id=teach_task_id)
-        # get dataset snapshot
-        export = self.client.call(
-            CreateExport(dataset_id=dataset.id, labelset_id=old_labelset_id, wait=True)
-        )
-        csv = self.client.call(DownloadExport(export.id))
-        print("Obtained snapshot")
-
-        # create workflow
-        workflow = self.structure.create_workflow(
-            name=workflow_name, dataset_id=dataset_id
-        )
-        time.sleep(2)
-        print("Created workflow")
-        old_model_group = self.client.call(
-            GetModelGroup(id=old_model_group_id, wait=True)
-        )
-        model_type = old_model_group.task_type.lower()
-        # Create new teach task
-        workflow = self.structure.add_teach_task(
-            task_name=workflow_name,
-            labelset_name=workflow_name,
-            target_names=list(old_target_name_map.keys()),
-            dataset_id=dataset.id,
-            workflow_id=workflow.id,
-            model_type=model_type,
-            data_column=data_column,
-        )
-        (
-            new_labelset_id,
-            new_model_group_id,
-            new_target_name_map,
-        ) = self._get_teach_task_details(
-            workflow.components[-1].model_group.questionnaire_id
-        )
-        # Get file_to_targets from export CSV
-        file_to_targets = {}
-        for _, row in csv.iterrows():
-            # Check for NaN filled rows
-            if isinstance(row[2], float):
-                continue
-            old_example_id = row[0]
-            old_examples = self._get_example_list(old_model_group_id)
-            targets_list = loads(row[2])["targets"]
-            file_to_targets[old_examples.get_example(old_example_id).data_file_name] = (
-                targets_list
-            )
-        labels = self.get_labels_by_filename(
-            new_model_group_id,
-            file_to_targets,
-            new_target_name_map,
-            rename_labels,
-            remove_labels,
-        )
-        # Label new teach task
-        result = self.structure.label_teach_task(
-            label_set_id=new_labelset_id,
-            labels=[dataclasses.asdict(label) for label in labels],
-            model_group_id=new_model_group_id,
-        )
-        if not result["submitLabelsV2"]["success"]:
-            raise ToolkitPopulationError("Error: Failed to submit labels")
-        return workflow
-
-    def inject_labels_into_teach_task(
-        self,
-        workflow_id: int,
-        teach_task_id: int,
-        file_to_targets: dict,
-        rename_labels: Dict[str, str] = None,
-        remove_labels: List[str] = None,
-    ):
-        """
-        Add label data into an existing teach task
-
-        Args:
-            workflow_id (int): Id of the workflow you wish to add labels to
-            teach_task_id (int): Id of the corresponding teach task to the workflow
-            file_to_targets (dict): mapping of filenames to target label data
-            rename_labels (dict, optional): Dictionary in format
-                {old_label_name : new_label_name}
-            remove_labels (list, optional): List of labels to remove from old teach task
-        """
-        (
-            labelset_id,
-            model_group_id,
-            target_name_map,
-        ) = self._get_teach_task_details(teach_task_id)
-        labels = self.get_labels_by_filename(
-            model_group_id,
-            file_to_targets,
-            target_name_map,
-            rename_labels,
-            remove_labels,
-        )
-        # Label new teach task
-        result = self.structure.label_teach_task(
-            label_set_id=labelset_id,
-            labels=[dataclasses.asdict(label) for label in labels],
-            model_group_id=model_group_id,
-        )
-        if not result["submitLabelsV2"]["success"]:
-            raise ToolkitPopulationError("Error: Failed to submit labels")
-
-    def get_labels_by_filename(
-        self,
-        model_group_id: int,
-        file_to_targets: dict,
-        target_name_map: dict,
-        rename_labels: Dict[str, str] = None,
-        remove_labels: List[str] = None,
-    ) -> List[LabelInput]:
-        """
-        Args:
-            model_group_id (int): ID of the model group to be labeled
-            file_to_targets (dict): mapping in the format {filename : targets_list}
-            target_name_map (dict): mapping of field name to corresponding target ID
-            rename_labels (dict, optional): Dictionary in format
-                {old_label_name : new_label_name}
-            remove_labels (list, optional): List of labels to remove from old teach task
-
-        Returns:
-            A list of LabelInput to be ingested by the platform via submitLabelsV2
-        """
-        labels = []
-        # Retrieve examples and match against filename
-        examples = self._get_example_list(model_group_id)
-
-        for filename, targets_list in file_to_targets.items():
-            if rename_labels or remove_labels:
-                targets_list = self._edit_labels(
-                    targets_list, rename_labels, remove_labels
-                )
-            targets_list = self._convert_label(targets_list, target_name_map)
-            example_id = examples.get_example_id(filename)
-            if example_id:
-                labels.append(LabelInput(example_id, targets_list))
-        return labels
-
-    def _edit_labels(
-        self,
-        targets_list: List[dict],
-        rename_labels: Dict[str, str],
-        remove_labels: List[str],
-    ):
-        new_targets_list = []
-        for target in targets_list:
-            if remove_labels and target["label"] not in remove_labels:
-                if rename_labels and rename_labels.get(target["label"]):
-                    target["label"] = rename_labels[target["label"]]
-                new_targets_list.append(target)
-        return new_targets_list
-
-    def _convert_label(
-        self, targets_list: List[dict], target_name_map: dict
-    ) -> List[LabelInst]:
-        updated_labels = []
-        for target in targets_list:
-            updated_label = LabelInst(target_name_map[target["label"]])
-            if target.get("spans"):
-                updated_spans = [
-                    TokenSpanInput(span["start"], span["end"], span["page_num"])
-                    for span in target["spans"]
-                ]
-                updated_label.spans = updated_spans
-            updated_labels.append(updated_label)
-        return updated_labels
-
-    def _get_teach_task_details(self, teach_task_id: int):
-        teach_task_details = self.structure.get_teach_details(
-            teach_task_id=teach_task_id
-        )
-        labelset_id = teach_task_details["questionnaire"]["question"]["labelset"]["id"]
-        model_group_id = teach_task_details["questionnaire"]["question"]["modelGroupId"]
-        target_names = teach_task_details["questionnaire"]["question"]["labelset"][
-            "targetNames"
-        ]
-        target_name_map = {}
-        for target in target_names:
-            target_name_map[target["name"]] = target["id"]
-        return labelset_id, model_group_id, target_name_map
-
-    def _get_example_list(self, model_group_id: int, limit=1000):
-        examples = self.structure.get_example_ids(
-            model_group_id=model_group_id, limit=limit
-        )
-        examples = ExampleList(
-            examples=[
-                Example(i["id"], i["datafile"]["name"])
-                for i in examples["modelGroup"]["pagedExamples"]["examples"]
-            ]
-        )
-        return examples
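For reference, the directory-to-label scan that `create_auto_classification_workflow` performed is plain `pathlib` logic and can be inlined where still needed. A sketch under the same assumptions as the deleted code; the accepted-extension set is truncated here for illustration:

    from pathlib import Path

    accepted_types = {"pdf", "docx", "png", "tiff"}  # truncated for illustration
    folder = Path("./base_directory/")
    # One subdirectory per class; each file is labeled with its parent folder name.
    files = [
        f for f in folder.glob("*/*")
        if f.is_file() and f.suffix.strip(".").casefold() in accepted_types
    ]
    classes = {f.parent.name for f in files}
    labeled_files = {f.name: [{"label": f.parent.name}] for f in files}
    if len(classes) < 2:
        raise ValueError(
            f"You must have documents in at least 2 directories, you only have {len(classes)}"
        )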
diff --git a/indico_toolkit/auto_populate/types.py b/indico_toolkit/auto_populate/types.py
deleted file mode 100644
index dae2460d..00000000
--- a/indico_toolkit/auto_populate/types.py
+++ /dev/null
@@ -1,66 +0,0 @@
-from dataclasses import dataclass
-from typing import List
-
-
-@dataclass
-class Example:
-    id: int
-    data_file_name: str
-
-
-class ExampleList:
-    def __init__(self, examples: List[Example]):
-        self.examples = examples
-
-    def get_example(self, example_id: int) -> Example:
-        """
-        Returns example with matching example_id. If no matching example id found,
-        return None.
-        """
-        for example in self.examples:
-            if example.id == example_id:
-                return example
-        return None
-
-    def get_example_id(self, example_data_file_name: str) -> int:
-        """
-        Returns id for a specific example with the same name as example_data_file_name.
-        If no matching example found, return None. Assumes no duplicate filenames in
-        dataset
-        """
-        for example in self.examples:
-            if example.data_file_name == example_data_file_name:
-                return example.id
-        return None
-
-
-@dataclass
-class TokenSpanInput:
-    start: int
-    end: int
-    pageNum: int
-
-
-@dataclass
-class SpatialSpanInput:
-    top: int
-    bottom: int
-    left: int
-    right: int
-    pageNum: int
-
-
-@dataclass
-class LabelInst:
-    clsId: int
-    spans: List[TokenSpanInput] = None
-    bounds: List[SpatialSpanInput] = None
-
-
-@dataclass
-class LabelInput:
-    exampleId: int
-    targets: List[LabelInst]
-    rejected: bool = None
-    override: bool = None
-    partial: bool = None
diff --git a/indico_toolkit/indico_wrapper/__init__.py b/indico_toolkit/indico_wrapper/__init__.py
index 40dd45aa..faa39014 100644
--- a/indico_toolkit/indico_wrapper/__init__.py
+++ b/indico_toolkit/indico_wrapper/__init__.py
@@ -1,15 +1,9 @@
-from .dataset import Datasets
-from .doc_extraction import DocExtraction
 from .download import Download
 from .indico_wrapper import IndicoWrapper
-from .reviewer import Reviewer
 from .workflow import Workflow
 
 __all__ = (
-    "Datasets",
-    "DocExtraction",
     "Download",
     "IndicoWrapper",
-    "Reviewer",
     "Workflow",
 )
diff --git a/indico_toolkit/indico_wrapper/dataset.py b/indico_toolkit/indico_wrapper/dataset.py
deleted file mode 100644
index a954d5ab..00000000
--- a/indico_toolkit/indico_wrapper/dataset.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from typing import List
-
-from indico import IndicoClient
-from indico.queries import (
-    AddDataToWorkflow,
-    AddFiles,
-    CreateDataset,
-    CreateEmptyDataset,
-    DeleteDataset,
-    GetDataset,
-)
-from indico.types import Dataset, OcrEngine, Workflow
-
-from .indico_wrapper import IndicoWrapper
-
-
-class Datasets(IndicoWrapper):
-    def __init__(self, client: IndicoClient):
-        self.client = client
-
-    def get_dataset(self, dataset_id: int):
-        return self.client.call(GetDataset(dataset_id))
-
-    def add_files_to_dataset(self, dataset_id: int, filepaths: List[str]) -> Dataset:
-        """
-        Upload documents to an existing dataset and wait for them to OCR
-        """
-        dataset = self.client.call(
-            AddFiles(
-                dataset_id=dataset_id, files=filepaths, autoprocess=True, wait=True
-            )
-        )
-        return dataset
-
-    def add_new_files_to_task(self, workflow_id: int, wait: bool = True) -> Workflow:
-        """
-        Add newly uploaded documents to an existing teach task given the task's
-        associated workflow ID
-
-        Args:
-            workflow_id (int): workflow ID associated with teach task
-            wait (bool, optional): wait for data to be added. Defaults to True.
-        """
-        workflow = self.client.call(AddDataToWorkflow(workflow_id, wait))
-        if wait:
-            print(f"Data added to all teach tasks associated with {workflow.id}")
-        return workflow
-
-    def create_empty_dataset(
-        self,
-        dataset_name: str,
-        dataset_type: str = "DOCUMENT",
-        ocr_engine: OcrEngine = OcrEngine.READAPI,
-    ) -> Dataset:
-        """
-        Create an empty dataset
-        Args:
-            dataset_name (str): Name of the dataset
-            dataset_type (str, optional): TEXT, IMAGE, or DOCUMENT.
-                Defaults to "DOCUMENT".
-        """
-        return self.client.call(
-            CreateEmptyDataset(dataset_name, dataset_type, ocr_engine)
-        )
-
-    def create_dataset(
-        self,
-        filepaths: List[str],
-        dataset_name: str,
-        ocr_engine: OcrEngine = OcrEngine.READAPI,
-    ) -> Dataset:
-        dataset = self.client.call(
-            CreateDataset(
-                name=dataset_name,
-                files=filepaths,
-                ocr_engine=ocr_engine,
-            )
-        )
-        self.dataset_id = dataset.id
-        return dataset
-
-    def delete_dataset(self, dataset_id: int) -> bool:
-        """
-        Returns True if the operation is successful
-        """
-        return self.client.call(DeleteDataset(id=dataset_id))
-
-    def get_dataset_metadata(self, dataset_id: int) -> List[dict]:
-        """
-        Get list of dataset files with information like file name, status, and number of
-        pages
-        """
-        query = """
-            query GetDataset($id: Int) {
-                dataset(id: $id) {
-                    id
-                    name
-                    files {
-                        id
-                        name
-                        numPages
-                        status
-                    }
-                }
-            }
-        """
-        dataset = self.graphQL_request(
-            graphql_query=query, variables={"id": dataset_id}
-        )
-        return dataset["dataset"]["files"]
-
-    def get_col_name_by_id(self, dataset_id: int, col_id: int) -> str:
-        dataset = self.get_dataset(dataset_id)
-        return next(c.name for c in dataset.datacolumns if c.id == col_id)
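Of the removed `Datasets` helpers, only `get_dataset_metadata` carried its own GraphQL document; callers can issue the same query through `indico.queries.GraphQLRequest` (still imported by `conftest.py` below). A sketch assuming an instantiated `client` and a placeholder dataset ID:

    from indico.queries import GraphQLRequest

    GET_DATASET_FILES = """
        query GetDataset($id: Int) {
            dataset(id: $id) {
                id
                name
                files {
                    id
                    name
                    numPages
                    status
                }
            }
        }
    """

    # Same payload the deleted wrapper returned: one dict per dataset file.
    response = client.call(GraphQLRequest(query=GET_DATASET_FILES, variables={"id": 1234}))
    files = response["dataset"]["files"]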
diff --git a/indico_toolkit/indico_wrapper/doc_extraction.py b/indico_toolkit/indico_wrapper/doc_extraction.py
deleted file mode 100644
index 3a10170d..00000000
--- a/indico_toolkit/indico_wrapper/doc_extraction.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from typing import List, Union
-
-from indico import IndicoClient
-from indico.queries import DocumentExtraction, Job
-
-from ..ocr import CustomOcr, OnDoc, StandardOcr
-from .indico_wrapper import IndicoWrapper
-
-
-class DocExtraction(IndicoWrapper):
-    """
-    Class to support DocumentExtraction-related API calls
-    """
-
-    def __init__(
-        self,
-        client: IndicoClient,
-        preset_config: str = "standard",
-        custom_config: dict = None,
-    ):
-        """
-        Args:
-            preset_config (str): Options are simple, legacy, detailed, ondocument, and
-                standard.
-        """
-        self._preset_config = preset_config
-        self.client = client
-        self.json_config = {"preset_config": preset_config}
-        if custom_config:
-            self.json_config = custom_config
-
-    def run_ocr(
-        self, filepaths: List[str], text_setting: str = None
-    ) -> List[Union[StandardOcr, OnDoc, CustomOcr, str]]:
-        """
-        Args:
-            filepaths (List[str]): List of paths to local documents you would like to
-                submit for extraction
-            text_setting (str): Options are full_text and page_texts.
-
-        Returns:
-            extracted_data (List[Union[StandardOcr, OnDoc, CustomOcr, str]]): data from
-                DocumentExtraction converted to OCR objects or string text
-        """
-        jobs = self._submit_to_ocr(filepaths)
-        extracted_data = []
-        for ind, job in enumerate(jobs):
-            status = self.get_job_status(job.id, True)
-            if status.status == "SUCCESS":
-                result = self.get_storage_object(status.result)
-                if text_setting == "full_text":
-                    extracted_data.append(self._convert_ocr_objects(result).full_text)
-                elif text_setting == "page_texts":
-                    extracted_data.append(self._convert_ocr_objects(result).page_texts)
-                else:
-                    extracted_data.append(self._convert_ocr_objects(result))
-            else:
-                raise RuntimeError(
-                    f"{filepaths[ind]} {status.status}: {status.result}."
-                )
-        return extracted_data
-
-    def _submit_to_ocr(self, filepaths: List[str]) -> List[Job]:
-        return self.client.call(
-            DocumentExtraction(files=filepaths, json_config=self.json_config)
-        )
-
-    def _convert_ocr_objects(
-        self, extracted_data: Union[List[dict], dict]
-    ) -> Union[StandardOcr, OnDoc, CustomOcr]:
-        if self.json_config == {"preset_config": "ondocument"}:
-            return OnDoc(extracted_data)
-        elif (
-            self.json_config == {"preset_config": "standard"}
-            or self.json_config is None
-        ):
-            return StandardOcr(extracted_data)
-        else:
-            return CustomOcr(extracted_data)
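`OnDoc` and `StandardOcr` stay in the toolkit; only the submission wrapper above is removed. Wrapping a raw ondocument result (e.g. `ocr_result` from the earlier sketch) still works; the properties shown here are the ones exercised by the integration tests deleted later in this patch:

    from indico_toolkit.ocr import OnDoc

    ondoc = OnDoc(ocr_result)
    print(ondoc.total_pages, ondoc.total_tokens)
    print(ondoc.full_text[:80])
    print(ondoc.ocr_confidence("mean"))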
{"model_name": [{"label"...}]} - """ - self.graphQL_request( - SUBMIT_REVIEW, - { - "rejected": False, - "submissionId": submission_id, - "changes": json.dumps(changes), - }, - ) - - def get_random_review_id(self): - response = self.graphQL_request( - GET_RANDOM_REVIEW_ID, {"workflowId": self.workflow_id} - ) - try: - return response["randomSubmission"]["id"] - except Exception: - raise RuntimeError("The review queue is empty") - - def get_random_exception_id(self): - response = self.graphQL_request( - GET_RANDOM_EXCEPTION_ID, {"workflowId": self.workflow_id} - ) - try: - return response["randomSubmission"]["id"] - except Exception: - raise RuntimeError("The exception queue is empty") - - def reject_submission(self, submission_id): - return self.graphQL_request( - SUBMIT_REVIEW, {"rejected": True, "submissionId": submission_id} - ) - - -SUBMIT_REVIEW = """ -mutation submitStandardQueue( - $changes: JSONString, - $rejected: Boolean, - $submissionId: Int!, - $notes: String -) { - submitReview( - changes: $changes, - rejected: $rejected, - submissionId: $submissionId, - notes: $notes -) { - id - __typename - } -} -""" - -GET_RANDOM_EXCEPTION_ID = """ -query getExceptionsSubmission($workflowId: Int!) { - randomSubmission(adminReview: true, workflowId: $workflowId) { - id - resultFile - inputFilename - autoReview { - id - changes - __typename - } - __typename - } -} -""" - -GET_RANDOM_REVIEW_ID = """ -query getSubmission($workflowId: Int!) { - randomSubmission(adminReview: false, workflowId: $workflowId) { - id - resultFile - inputFilename - autoReview { - id - changes - __typename - } - __typename - } -} -""" diff --git a/indico_toolkit/ocr/__init__.py b/indico_toolkit/ocr/__init__.py index fbf15bf5..7d728eac 100644 --- a/indico_toolkit/ocr/__init__.py +++ b/indico_toolkit/ocr/__init__.py @@ -1,9 +1,7 @@ -from .customocr_object import CustomOcr from .ondoc_object import OnDoc from .standard_object import StandardOcr __all__ = ( - "CustomOcr", "OnDoc", "StandardOcr", ) diff --git a/indico_toolkit/ocr/customocr_object.py b/indico_toolkit/ocr/customocr_object.py deleted file mode 100644 index 78e1c79c..00000000 --- a/indico_toolkit/ocr/customocr_object.py +++ /dev/null @@ -1,42 +0,0 @@ -from typing import List, Union - - -class CustomOcr: - """ - CustomOcr is a helper class for the raw preset config OCR results. Enables easy - extraction of full text and page-level text. 
diff --git a/indico_toolkit/ocr/customocr_object.py b/indico_toolkit/ocr/customocr_object.py
deleted file mode 100644
index 78e1c79c..00000000
--- a/indico_toolkit/ocr/customocr_object.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from typing import List, Union
-
-
-class CustomOcr:
-    """
-    CustomOcr is a helper class for the raw preset config OCR results. Enables easy
-    extraction of full text and page-level text.
-    """
-
-    def __init__(self, customocr: Union[List[dict], dict]):
-        """
-        customocr Union[List[dict], dict]: result object from
-            indico.queries.DocumentExtraction
-        """
-        self.customocr = customocr
-
-    @property
-    def full_text(self) -> str:
-        """
-        Return full document text as a string
-        """
-        if isinstance(self.customocr, dict) and "text" in self.customocr:
-            return self.customocr["text"]
-        elif isinstance(self.customocr, dict) and "pages" in self.customocr:
-            if "text" in self.customocr["pages"][0]:
-                return "\n".join(page["text"] for page in self.customocr["pages"])
-        elif isinstance(self.customocr, list) and "pages" in self.customocr[0]:
-            if "text" in self.customocr[0]["pages"][0]:
-                return "\n".join(page["pages"][0]["text"] for page in self.customocr)
-        raise RuntimeError("JSON configuration setting does not have full text.")
-
-    @property
-    def page_texts(self) -> List[str]:
-        """
-        Return list of page-level text
-        """
-        if isinstance(self.customocr, dict) and "pages" in self.customocr:
-            return [page["text"] for page in self.customocr["pages"]]
-        elif isinstance(self.customocr, list) and "pages" in self.customocr[0]:
-            if "text" in self.customocr[0]["pages"][0]:
-                return [page["pages"][0]["text"] for page in self.customocr]
-        raise RuntimeError("JSON configuration setting does not have page-level text.")
diff --git a/mypy.ini b/mypy.ini
index a205789e..fd98fdd6 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -18,9 +18,6 @@ ignore_missing_imports = True
 [mypy-indico_toolkit.association.*]
 ignore_errors = True
 
-[mypy-indico_toolkit.auto_populate.*]
-ignore_errors = True
-
 [mypy-indico_toolkit.auto_review.*]
 ignore_errors = True
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index b0c3d356..ec91e109 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -6,14 +6,11 @@
     AddModelGroupComponent,
     CreateDataset,
     CreateWorkflow,
-    DocumentExtraction,
     GetTrainingModelWithProgress,
     GraphQLRequest,
-    JobStatus,
-    RetrieveStorageObject,
 )
 
-from indico_toolkit.indico_wrapper import DocExtraction, Workflow
+from indico_toolkit.indico_wrapper import Workflow
 
 
 def pytest_addoption(parser: pytest.Parser) -> None:
@@ -62,11 +59,6 @@ def dataset_id(dataset):
     return dataset.id
 
 
-@pytest.fixture(scope="session")
-def doc_extraction_standard(indico_client):
-    return DocExtraction(indico_client)
-
-
 @pytest.fixture(scope="session")
 def extraction_model_group_id(workflow):
     return workflow.components[-1].model_group.id
@@ -102,18 +94,6 @@ def module_submission_ids(workflow_id, indico_client, pdf_file):
     return sub_ids
 
 
-@pytest.fixture(scope="session")
-def ondoc_ocr_object(indico_client, pdf_file):
-    job = indico_client.call(
-        DocumentExtraction(
-            files=[pdf_file], json_config={"preset_config": "ondocument"}
-        )
-    )
-    job = indico_client.call(JobStatus(id=job[0].id, wait=True))
-    extracted_data = indico_client.call(RetrieveStorageObject(job.result))
-    return extracted_data
-
-
 @pytest.fixture(scope="session")
 def pdf_file(tests_folder: Path) -> Path:
     return tests_folder / "data/samples/fin_disc.pdf"
@@ -124,17 +104,6 @@ def populator_snapshot_file(tests_folder: Path) -> Path:
     return tests_folder / "data/snapshots/populator_snapshot.csv"
 
 
-@pytest.fixture(scope="session")
-def standard_ocr_object(indico_client, pdf_file):
-    # TODO: this can be static-- probably should be "ondoc" as well
-    job = indico_client.call(
-        DocumentExtraction(files=[pdf_file], json_config={"preset_config": "standard"})
-    )
-    job = indico_client.call(JobStatus(id=job[0].id, wait=True))
-    extracted_data = indico_client.call(RetrieveStorageObject(job.result))
-    return extracted_data
-
-
 @pytest.fixture(scope="session")
 def teach_task_id(workflow):
     return workflow.components[-1].model_group.questionnaire_id
diff --git a/tests/integration/indico_wrapper/test_dataset.py b/tests/integration/indico_wrapper/test_dataset.py
deleted file mode 100644
index 2b785071..00000000
--- a/tests/integration/indico_wrapper/test_dataset.py
+++ /dev/null
@@ -1,40 +0,0 @@
-"""
-Test Datasets class methods
-"""
-
-import pytest
-from indico.types import Dataset
-
-from indico_toolkit.indico_wrapper import Datasets
-
-
-@pytest.fixture(scope="module")
-def dataset_wrapper(indico_client):
-    return Datasets(indico_client)
-
-
-def test_get_dataset(dataset_wrapper, dataset_id):
-    dataset = dataset_wrapper.get_dataset(dataset_id)
-    assert isinstance(dataset, Dataset)
-
-
-def test_add_to_dataset(dataset_wrapper, dataset_id, pdf_file):
-    dataset = dataset_wrapper.add_files_to_dataset(dataset_id, filepaths=[pdf_file])
-    assert isinstance(dataset, Dataset)
-    for f in dataset.files:
-        assert f.status in ["PROCESSED", "FAILED"]
-
-
-def test_get_dataset_files(dataset_wrapper, dataset_id):
-    files_list = dataset_wrapper.get_dataset_metadata(dataset_id)
-    assert isinstance(files_list, list)
-    assert len(files_list) > 0
-
-
-def test_create_delete_dataset(dataset_wrapper, pdf_file):
-    dataset = dataset_wrapper.create_dataset(
-        filepaths=[pdf_file], dataset_name="Toolkit Integration Tests"
-    )
-    assert isinstance(dataset, Dataset)
-    status = dataset_wrapper.delete_dataset(dataset.id)
-    assert status
diff --git a/tests/integration/indico_wrapper/test_doc_extraction.py b/tests/integration/indico_wrapper/test_doc_extraction.py
deleted file mode 100644
index 7917f68d..00000000
--- a/tests/integration/indico_wrapper/test_doc_extraction.py
+++ /dev/null
@@ -1,127 +0,0 @@
-from indico_toolkit.indico_wrapper import DocExtraction
-from indico_toolkit.ocr import OnDoc, StandardOcr
-
-
-def test_run_ocr_ondoc(indico_client, pdf_file):
-    doc_extraction_ondoc = DocExtraction(indico_client, preset_config="ondocument")
-    extracted_data = doc_extraction_ondoc.run_ocr(filepaths=[pdf_file])
-    for item in extracted_data:
-        assert isinstance(item, OnDoc)
-
-
-def test_run_ocr_standard(doc_extraction_standard, pdf_file):
-    extracted_data = doc_extraction_standard.run_ocr(filepaths=[pdf_file])
-    for item in extracted_data:
-        assert isinstance(item, StandardOcr)
-
-
-def test_run_ocr_standard_full_text(doc_extraction_standard, pdf_file):
-    full_text_result = doc_extraction_standard.run_ocr(
-        filepaths=[pdf_file], text_setting="full_text"
-    )
-    assert len(full_text_result[0]) == 2062
-
-
-def test_run_ocr_standard_page_texts(doc_extraction_standard, pdf_file):
-    page_texts_result = doc_extraction_standard.run_ocr(
-        filepaths=[pdf_file], text_setting="page_texts"
-    )
-    assert len(page_texts_result[0][0]) == 1153
-
-
-def test_run_ocr_custom_full_text(indico_client, pdf_file):
-    doc_extraction_custom = DocExtraction(
-        indico_client,
-        custom_config={
-            "top_level": "page",
-            "nest": False,
-            "reblocking": ["style", "list", "inline-header"],
-            "pages": [
-                "text",
-                "size",
-                "dpi",
-                "doc_offset",
-                "page_num",
-                "image",
-                "thumbnail",
-            ],
-            "blocks": [
-                "text",
-                "doc_offset",
-                "page_offset",
-                "position",
-                "block_type",
-                "page_num",
-            ],
-            "tokens": [
-                "text",
-                "doc_offset",
-                "page_offset",
-                "block_offset",
-                "position",
-                "page_num",
-                "style",
-            ],
-            "chars": [
-                "text",
-                "doc_index",
-                "block_index",
-                "page_index",
-                "page_num",
-                "position",
-            ],
-        },
-    )
-    full_text_result = doc_extraction_custom.run_ocr(
-        filepaths=[pdf_file], text_setting="full_text"
-    )
-    assert len(full_text_result[0]) == 2067
-
-
-def test_run_ocr_custom_page_texts(indico_client, pdf_file):
-    doc_extraction_custom = DocExtraction(
-        indico_client,
-        custom_config={
-            "top_level": "page",
-            "nest": False,
-            "reblocking": ["style", "list", "inline-header"],
-            "pages": [
-                "text",
-                "size",
-                "dpi",
-                "doc_offset",
-                "page_num",
-                "image",
-                "thumbnail",
-            ],
-            "blocks": [
-                "text",
-                "doc_offset",
-                "page_offset",
-                "position",
-                "block_type",
-                "page_num",
-            ],
-            "tokens": [
-                "text",
-                "doc_offset",
-                "page_offset",
-                "block_offset",
-                "position",
-                "page_num",
-                "style",
-            ],
-            "chars": [
-                "text",
-                "doc_index",
-                "block_index",
-                "page_index",
-                "page_num",
-                "position",
-            ],
-        },
-    )
-    page_texts_result = doc_extraction_custom.run_ocr(
-        filepaths=[pdf_file], text_setting="page_texts"
-    )
-    assert len(page_texts_result[0][0]) == 1158
diff --git a/tests/integration/indico_wrapper/test_reviewer.py b/tests/integration/indico_wrapper/test_reviewer.py
deleted file mode 100644
index 9b26a4ce..00000000
--- a/tests/integration/indico_wrapper/test_reviewer.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import pytest
-
-from indico_toolkit.indico_wrapper import Reviewer, Workflow
-
-
-@pytest.fixture(scope="module")
-def submissions_awaiting_review(workflow_id, indico_client, pdf_file):
-    """
-    Ensure that auto review is turned off and there are two submissions "PENDING_REVIEW"
-    """
-    workflow_wrapper = Workflow(indico_client)
-    workflow_wrapper.update_workflow_settings(
-        workflow_id, enable_review=True, enable_auto_review=False
-    )
-    sub_ids = workflow_wrapper.submit_documents_to_workflow(
-        workflow_id, files=[pdf_file, pdf_file]
-    )
-    workflow_wrapper.wait_for_submissions_to_process(sub_ids)
-
-
-def get_change_formatted_predictions(workflow_result):
-    """
-    Helper function to get the change format for accepted predictions in test_accept_review
-    """
-    return {workflow_result.model_name: workflow_result.get_predictions.to_list()}
-
-
-@pytest.mark.skip(reason="broken on indico-client>=6.1.0")
-def test_accept_review(submissions_awaiting_review, indico_client, workflow_id):
-    reviewer_wrapper = Reviewer(indico_client, workflow_id)
-    id_in_review = reviewer_wrapper.get_random_review_id()
-    submission = reviewer_wrapper.get_submission_object(id_in_review)
-    assert submission.status == "PENDING_REVIEW"
-    predictions = reviewer_wrapper.get_submission_results_from_ids([id_in_review])
-    changes = get_change_formatted_predictions(predictions[0])
-    reviewer_wrapper.accept_review(id_in_review, changes)
-    submission = reviewer_wrapper.get_submission_object(id_in_review)
-    assert submission.status == "COMPLETE"
-
-
-@pytest.mark.skip(reason="flaky, depends on submission processing time")
-def test_reject_from_review(submissions_awaiting_review, indico_client, workflow_id):
-    reviewer_wrapper = Reviewer(indico_client, workflow_id)
-    id_in_review = reviewer_wrapper.get_random_review_id()
-    reviewer_wrapper.reject_submission(id_in_review)
-    submission = reviewer_wrapper.get_submission_object(id_in_review)
-    assert submission.status == "PENDING_ADMIN_REVIEW"
-
-
-@pytest.mark.skip(reason="flaky, depends on submission processing time")
-def test_reject_from_admin_review(
-    submissions_awaiting_review, indico_client, workflow_id
-):
-    reviewer_wrapper = Reviewer(indico_client, workflow_id)
-    id_in_exception = reviewer_wrapper.get_random_exception_id()
-    submission = reviewer_wrapper.get_submission_object(id_in_exception)
-    assert submission.status == "PENDING_ADMIN_REVIEW"
-    reviewer_wrapper.reject_submission(id_in_exception)
-    submission = reviewer_wrapper.get_submission_object(id_in_exception)
-    assert submission.status == "COMPLETE"
diff --git a/tests/integration/ocr/__init__.py b/tests/integration/ocr/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/integration/ocr/test_customocr_object.py b/tests/integration/ocr/test_customocr_object.py
deleted file mode 100644
index 078ca7f2..00000000
--- a/tests/integration/ocr/test_customocr_object.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import pytest
-
-from indico_toolkit.indico_wrapper import DocExtraction
-
-
-def test_full_text(indico_client, pdf_file):
-    doc_extraction = DocExtraction(indico_client, preset_config="simple")
-    custom_ocr = doc_extraction.run_ocr(filepaths=[pdf_file])
-    assert len(custom_ocr[0].full_text) == 2823
-
-
-def test_full_text_exception(indico_client, pdf_file):
-    doc_extraction = DocExtraction(
-        indico_client,
-        custom_config={
-            "nest": True,
-            "top_level": "document",
-            "native_pdf": True,
-            "blocks": ["text", "position", "doc_offset", "page_offset"],
-        },
-    )
-    custom_ocr = doc_extraction.run_ocr(filepaths=[pdf_file])
-    with pytest.raises(Exception):
-        custom_ocr[0].full_text
-
-
-def test_page_texts(indico_client, pdf_file):
-    doc_extraction = DocExtraction(
-        indico_client,
-        custom_config={
-            "nest": True,
-            "top_level": "document",
-            "native_pdf": True,
-            "pages": ["text", "size", "dpi", "doc_offset", "page_num", "image"],
-            "blocks": ["text", "position", "doc_offset", "page_offset"],
-        },
-    )
-    custom_ocr = doc_extraction.run_ocr(filepaths=[pdf_file])
-    assert isinstance(custom_ocr[0].page_texts, list)
-    assert isinstance(custom_ocr[0].page_texts[0], str)
-
-
-def test_page_texts_exception(indico_client, pdf_file):
-    doc_extraction = DocExtraction(indico_client, preset_config="legacy")
-    custom_ocr = doc_extraction.run_ocr(filepaths=[pdf_file])
-    with pytest.raises(Exception):
-        custom_ocr.page_texts
diff --git a/tests/integration/ocr/test_ondoc_object.py b/tests/integration/ocr/test_ondoc_object.py
deleted file mode 100644
index e5f49c6b..00000000
--- a/tests/integration/ocr/test_ondoc_object.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import pytest
-
-from indico_toolkit.ocr import OnDoc
-
-
-def test_ondoc_full_text(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert len(ondoc_ocr.full_text) == 2067
-
-
-def test_ondoc_page_texts(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert len(ondoc_ocr.page_texts) == 2
-    assert len(ondoc_ocr.page_texts[0]) == 1158
-
-
-def test_ondoc_page_results(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert len(ondoc_ocr.page_results) == 2
-    assert len(ondoc_ocr.page_results[0]) == 8
-
-
-def test_ondoc_block_texts(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert len(ondoc_ocr.block_texts) == 41
-
-
-def test_ondoc_token_objects(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert len(ondoc_ocr.token_objects) == 304
-
-
-def test_ondoc_total_pages(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert ondoc_ocr.total_pages == 2
-
-
-def test_ondoc_total_characters(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert ondoc_ocr.total_characters == 2067
-
-
-def test_ondoc_total_tokens(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert ondoc_ocr.total_tokens == 304
-
-
-def test_ondoc_confidence(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    assert isinstance(ondoc_ocr.ocr_confidence("mean"), float)
-    assert 1 <= ondoc_ocr.ocr_confidence("mean") <= 100
-
-
-def test_ondoc_confidence_metric_exception(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object)
-    with pytest.raises(Exception):
-        ondoc_ocr.ocr_confidence("average")
-
-
-def test_ondoc_excluded_confidence_exception(ondoc_ocr_object):
-    ondoc_ocr = OnDoc(ondoc_ocr_object[0]["chars"][0].pop("confidence"))
-    with pytest.raises(Exception):
-        ondoc_ocr.ocr_confidence("mean")
diff --git a/tests/integration/ocr/test_standard_object.py b/tests/integration/ocr/test_standard_object.py
deleted file mode 100644
index 4753b8da..00000000
--- a/tests/integration/ocr/test_standard_object.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from indico_toolkit.ocr import StandardOcr
-
-
-def test_standard_object_full_text(standard_ocr_object):
-    standard_ocr = StandardOcr(standard_ocr_object)
-    assert len(standard_ocr.full_text) == 2062
-
-
-def test_standard_object_page_texts(standard_ocr_object):
-    standard_ocr = StandardOcr(standard_ocr_object)
-    assert len(standard_ocr.page_texts) == 2
-    assert len(standard_ocr.page_texts[0]) == 1153
-
-
-def test_standard_object_page_results(standard_ocr_object):
-    standard_ocr = StandardOcr(standard_ocr_object)
-    assert len(standard_ocr.page_results) == 2
-    assert len(standard_ocr.page_results[0]) == 4
-
-
-def test_standard_object_block_texts(standard_ocr_object):
-    standard_ocr = StandardOcr(standard_ocr_object)
-    assert len(standard_ocr.block_texts) == 36
-
-
-def test_standard_object_total_pages(standard_ocr_object):
-    standard_ocr = StandardOcr(standard_ocr_object)
-    assert standard_ocr.total_pages == 2
-
-
-def test_standard_object_total_characters(standard_ocr_object):
-    standard_ocr = StandardOcr(standard_ocr_object)
-    assert standard_ocr.total_characters == 2062
diff --git a/tests/integration/test_populator.py b/tests/integration/test_populator.py
deleted file mode 100644
index bd2981fd..00000000
--- a/tests/integration/test_populator.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import json
-import os
-
-import pytest
-from indico.queries import GetWorkflow
-from indico.types import Workflow
-
-from indico_toolkit.auto_populate import AutoPopulator
-from indico_toolkit.auto_populate.types import LabelInput, LabelInst
-
-pd = pytest.importorskip("pandas")
-
-
-@pytest.fixture(scope="function")
-def static_file_to_targets(populator_snapshot_file):
-    df = pd.read_csv(populator_snapshot_file)
-    file_to_targets = {}
-    for file, target in zip(
-        df["file_name_1820"].to_list(), df["Toolkit Test Financial Model"].to_list()
-    ):
-        if not isinstance(target, float):
-            file_to_targets[file] = json.loads(target)["targets"]
-    return file_to_targets
-
-
-def test_create_classification_workflow(indico_client, tests_folder):
-    auto_populator = AutoPopulator(indico_client)
-    new_workflow = auto_populator.create_auto_classification_workflow(
-        os.path.join(tests_folder, "data/auto_class"),
-        "My dataset",
-        "My workflow",
-        "My teach task",
-    )
-    assert isinstance(new_workflow, Workflow)
-
-
-def test_create_classification_workflow_too_few_classes(indico_client, tests_folder):
-    auto_populator = AutoPopulator(indico_client)
-    with pytest.raises(Exception):
-        auto_populator.create_auto_classification_workflow(
-            os.path.join(tests_folder, "data/auto_class/class_a/"),
-            "My dataset",
-            "My workflow",
-            "My teach task",
-        )
-
-
-def test_copy_teach_task(indico_client, dataset, workflow_id, teach_task_id):
-    auto_populator = AutoPopulator(indico_client)
-    original_workflow = indico_client.call(GetWorkflow(workflow_id))
-    new_workflow = auto_populator.copy_teach_task(
-        dataset_id=dataset.id,
-        teach_task_id=teach_task_id,
-        workflow_name=f"{original_workflow.name}_Copied",
-        data_column="text",
-    )
-    assert isinstance(new_workflow, Workflow)
-
-
-def test_get_labels_by_filename(
-    indico_client,
-    extraction_model_group_id,
-    teach_task_id,
-    static_file_to_targets,
-):
-    populator = AutoPopulator(indico_client)
-    (
-        labelset_id,
-        model_group_id,
-        target_name_map,
-    ) = populator._get_teach_task_details(teach_task_id)
-
-    labels = populator.get_labels_by_filename(
-        extraction_model_group_id, static_file_to_targets, target_name_map
-    )
-    assert len(labels) != 0
-    for label in labels:
-        assert isinstance(label, LabelInput)
-        for target in label.targets:
-            assert isinstance(target, LabelInst)
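Downstream code that relied on `AutoPopulator.copy_teach_task` only for snapshot retrieval can still export labels with the two queries the populator used. A sketch assuming an instantiated `client` and placeholder dataset/labelset IDs:

    from indico.queries import CreateExport, DownloadExport

    export = client.call(CreateExport(dataset_id=1234, labelset_id=5678, wait=True))
    df = client.call(DownloadExport(export.id))  # pandas DataFrame, one row per example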