From 2a351b65142508b892049be9fb994447915781d7 Mon Sep 17 00:00:00 2001
From: Drew Herbst <aherbst@broadinstitute.org>
Date: Fri, 8 Oct 2021 09:15:21 -0400
Subject: [PATCH 1/2] Biosample updates

Worm donor

Add new donor tables

Update transformations

Update transformations

Remove unneeded fields

Update documentation (#74)

Update README.md

Remove test files

New fields

Revert changes
---
 .gitignore                                    |  1 +
 ARCHITECTURE.md                               | 41 ++++++++++
 README.md                                     | 71 +++++++---------
 .../src/main/jade-tables/biosample.table.json | 52 ++++++++++++
 .../src/main/jade-tables/fly_donor.table.json | 66 +++++++++++++++
 .../src/main/jade-tables/library.table.json   | 33 ++++++++
 .../main/jade-tables/manatee_donor.table.json | 65 +++++++++++++++
 .../main/jade-tables/mouse_donor.table.json   | 66 +++++++++++++++
 .../main/jade-tables/worm_donor.table.json    | 82 +++++++++++++++++++
 9 files changed, 436 insertions(+), 41 deletions(-)
 create mode 100644 ARCHITECTURE.md
 create mode 100644 schema/src/main/jade-tables/fly_donor.table.json
 create mode 100644 schema/src/main/jade-tables/manatee_donor.table.json
 create mode 100644 schema/src/main/jade-tables/mouse_donor.table.json
 create mode 100644 schema/src/main/jade-tables/worm_donor.table.json

diff --git a/.gitignore b/.gitignore
index 8a121f3f..4d37ed89 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ target/
 project/metals.sbt
 Chart.lock
 charts/
+.DS_Store
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
new file mode 100644
index 00000000..83b45a2d
--- /dev/null
+++ b/ARCHITECTURE.md
@@ -0,0 +1,41 @@
+# ENCODE Ingest Architecture
+
+## Schema Design
+The schema used for this dataset was almost entirely inspired by the Terra
+Interoperability Model (TIM). JSON definitions of the resulting tables can be
+found under [`schema/`](./schema). The schema is purposefully very different
+from the source ENCODE schema: it is sample-centric (instead of experiment-centric),
+merges & splits ENCODE entities as needed, and does not attempt to capture all
+of the information published in the source data.
+![Data model](./images/data-model.jpg)
+
+## Pipeline Architecture
+The ingest pipeline is orchestrated through Argo, with most data processing logic
+delegated to Dataflow and BigQuery. The high-level flow looks like:
+![Architecture diagram](./images/encode-orch.png)
+
+NOTE: Only metadata ingest is implemented. The system still needs to be extended
+to ingest data files from ENCODE's S3 archives.
+
+### Extracting Metadata
+Metadata extraction is the first step of ENCODE ingest. The extraction component is
+written as a Dataflow pipeline, which begins by querying the metadata of all Biosample
+entities from human donors. From there, the pipeline traverses known foreign-key
+fields to scrape the rest of the metadata we care about:
+![Extraction graph](./images/encode-extract.png)
+
+Some important things to note:
+1. The initial query relies on there being relatively few Biosample entities in ENCODE's
+   system, as they all must be returned in a single response payload. Performance may
+   degrade as ENCODE continues to grow.
+2. The links we follow between different entity types were chosen based on non-scientific
+   experimentation and performance-testing. If the number of entities in a type grows
+   significantly, performance might degrade / the pipeline might crash.
+
+### Transforming Metadata
+Metadata transformation is also implemented as a Dataflow pipeline. Instead of one large
+DAG, this step can be viewed as a collection of tiny processing trees that just happen to
+run at the same time:
+![Transformation trees](./images/encode-transform.png)
+
+Note that some raw entities are used to contribute to multiple output tables.
diff --git a/README.md b/README.md
index ecefac92..1d798ee8 100644
--- a/README.md
+++ b/README.md
@@ -1,42 +1,31 @@
 # ENCODE Ingest
-Batch ETL pipeline to mirror ENCODE data into the Terra Data Repository (TDR).
-
-## Schema Design
-The schema used for this dataset was almost entirely inspired by the Terra
-Interoperability Model (TIM). JSON definitions of the resulting tables can be
-found under [`schema/`](./schema). The schema is purposefully very different
-from the source ENCODE schema: it is sample-centric (instead of experiment-centric),
-merges & splits ENCODE entities as needed, and does not attempt to capture all
-of the information published in the source data.
-![Data model](./images/data-model.jpg)
-
-## Pipeline Architecture
-The ingest pipeline is orchestrated through Argo, with most data processing logic
-delegated to Dataflow and BigQuery. The high level flow looks like:
-![Architecture diagram](./images/encode-orch.png)
-
-NOTE: Only metadata ingest is implemented. The system still needs to be extended
-to ingest data files from ENCODE's S3 archives.
-
-### Extracting Metadata
-Metadata extraction is the first step of ENCODE ingest. The extraction component is
-written as a Dataflow pipeline, which begins by querying the metadata of all Biosample
-entities from human donors. From there, the pipeline traverses known foreign-key
-fields to scrape the rest of the metadata we care about:
-![Extraction graph](./images/encode-extract.png)
-
-Some important things to note:
-1. The initial query relies on there being relatively few Biosample entities in ENCODE's
-   system, as they all must be returned in a single response payload. Performance may
-   degrade as ENCODE continues to grow.
-2. The links we follow between different entity types were chosen based on non-scientific
-   experimentation and performance-testing. If the number of entities in a type grows
-   significantly, performance migth degrade / the pipeline might crash.
-
-### Transforming Metadata
-Metadata transformation is also implemented as a Dataflow pipeline. Instead of one large
-DAG, this step can be viewed as a collection of tiny processing trees that just happen to
-run at the same time:
-![Transformation trees](./images/encode-transform.png)
-
-Note that some raw entities are used to contribute to multiple output tables.
+
+Batch ETL pipeline to mirror ENCODE data into the Terra Data Repository (TDR). See the [architecture documentation](https://github.com/DataBiosphere/encode-ingest/blob/master/ARCHITECTURE.md) for 
+further design details.
+
+## Getting Started
+
+Orchestration of the ETL flows in this project is implemented using [Argo Workflows](https://argoproj.github.io/argo-workflows/).
+
+The core extraction and transformation data pipelines are implemented in [Scio](https://spotify.github.io/scio/) on top of Apache Beam.
+
+After cloning the repository, ensure you can compile the code, auto-generate schema classes
+and run the test suite from the repository root:
+
+`sbt test`
+
+## Development Process
+
+All development should be done on branches off of the protected `master` branch. After review, merge to `master`
+and then follow the instructions in the [monster-deploy repo](https://github.com/broadinstitute/monster-deploy/).
+
+When modifying the Scio data pipelines, it's possible to run the pipeline locally by invoking the relevant pipeline:
+
+* **Extraction:** 
+
+`sbt "encode-extraction / runMain org.broadinstitute.monster.encode.extraction.ExtractionPipeline  --outputDir=<some local directory>"`
+* **Transformation:**
+
+`sbt "encode-transformation-pipeline / runMain org.broadinstitute.monster.encode.transformation.TransformationPipeline --inputPrefix=<extraction dir> --outputPrefix=<output dir>"`
+
+Development of Argo changes requires deployment to the DEV environment as documented in the [monster-deploy repo](https://github.com/broadinstitute/monster-deploy/).
diff --git a/schema/src/main/jade-tables/biosample.table.json b/schema/src/main/jade-tables/biosample.table.json
index 2c63370b..33a74899 100644
--- a/schema/src/main/jade-tables/biosample.table.json
+++ b/schema/src/main/jade-tables/biosample.table.json
@@ -114,6 +114,58 @@
     {
       "name": "lot_id",
       "datatype": "string"
+    },
+    {
+      "name": "fly_life_stage",
+      "datatype": "string"
+    },
+    {
+      "name": "fly_synchronization_stage",
+      "datatype": "string"
+    },
+    {
+      "name": "model_organism_age",
+      "datatype": "integer"
+    },
+    {
+      "name": "model_organism_age_unit",
+      "datatype": "string"
+    },
+    {
+      "name": "mouse_life_stage",
+      "datatype": "string"
+    },
+    {
+      "name": "origin_batch",
+      "datatype": "string"
+    },
+    {
+      "name": "passage_number",
+      "datatype": "integer"
+    },
+    {
+      "name": "post_nucleic_acid_delivery_time",
+      "datatype": "integer"
+    },
+    {
+      "name": "post_nucleic_acid_delivery_time_unit",
+      "datatype": "string"
+    },
+    {
+      "name": "pulse_chase_time",
+      "datatype": "integer"
+    },
+    {
+      "name": "pulse_chase_time_unit",
+      "datatype": "string"
+    },
+    {
+      "name": "worm_life_stage",
+      "datatype": "string"
+    },
+    {
+      "name": "worm_synchronization_stage",
+      "datatype": "string"
     }
   ]
 }
diff --git a/schema/src/main/jade-tables/fly_donor.table.json b/schema/src/main/jade-tables/fly_donor.table.json
new file mode 100644
index 00000000..abe2d24b
--- /dev/null
+++ b/schema/src/main/jade-tables/fly_donor.table.json
@@ -0,0 +1,66 @@
+{
+  "name": "fly_donor",
+  "columns": [
+    {
+      "name": "id",
+      "datatype": "string",
+      "type": "primary_key"
+    },
+    {
+      "name": "cross_references",
+      "datatype": "string",
+      "type": "repeated"
+    },
+    {
+      "name": "time_created",
+      "datatype": "timestamp",
+      "type": "required"
+    },
+    {
+      "name": "organism",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "award",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "lab",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "submitted_by",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "source",
+      "datatype": "string"
+    },
+    {
+      "name": "genetic_modifications",
+      "datatype": "string",
+      "type": "repeated"
+    },
+    {
+      "name": "genotype",
+      "datatype": "string"
+    },
+    {
+      "name": "strain_background",
+      "datatype": "string"
+    },
+    {
+      "name": "strain_name",
+      "datatype": "string"
+    },
+    {
+      "name": "parent_strains",
+      "datatype": "string",
+      "type": "repeated"
+    }
+  ]
+}
diff --git a/schema/src/main/jade-tables/library.table.json b/schema/src/main/jade-tables/library.table.json
index 3ab1731d..07d1c960 100644
--- a/schema/src/main/jade-tables/library.table.json
+++ b/schema/src/main/jade-tables/library.table.json
@@ -80,6 +80,39 @@
     {
       "name": "prep_material_name",
       "datatype": "string"
+    },
+    {
+      "name": "construction_method",
+      "datatype": "string"
+    },
+    {
+      "name": "fragment_length_cv",
+      "datatype": "integer"
+    },
+    {
+      "name": "fragment_length_sd",
+      "datatype": "integer"
+    },
+    {
+      "name": "fragmentation_duration_time",
+      "datatype": "integer"
+    },
+    {
+      "name": "fragmentation_duration_time_unit",
+      "datatype": "string"
+    },
+    {
+      "name": "linkers",
+      "datatype": "string",
+      "type": "repeated"
+    },
+    {
+      "name": "mint_mixture_identifier",
+      "datatype": "string"
+    },
+    {
+      "name": "replicates",
+      "datatype": "string"
     }
   ]
 }
diff --git a/schema/src/main/jade-tables/manatee_donor.table.json b/schema/src/main/jade-tables/manatee_donor.table.json
new file mode 100644
index 00000000..558be6a8
--- /dev/null
+++ b/schema/src/main/jade-tables/manatee_donor.table.json
@@ -0,0 +1,65 @@
+{
+  "name": "manatee_donor",
+  "columns": [
+    {
+      "name": "id",
+      "datatype": "string",
+      "type": "primary_key"
+    },
+    {
+      "name": "cross_references",
+      "datatype": "string",
+      "type": "repeated"
+    },
+    {
+      "name": "time_created",
+      "datatype": "timestamp",
+      "type": "required"
+    },
+    {
+      "name": "age_min",
+      "datatype": "integer"
+    },
+    {
+      "name": "age_max",
+      "datatype": "integer"
+    },
+    {
+      "name": "age_unit",
+      "datatype": "string"
+    },
+    {
+      "name": "organism",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "award",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "lab",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "life_stage",
+      "datatype": "string"
+    },
+    {
+      "name": "submitted_by",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "source",
+      "datatype": "string"
+    },
+    {
+      "name": "genetic_modifications",
+      "datatype": "string",
+      "type": "repeated"
+    }
+  ]
+}
diff --git a/schema/src/main/jade-tables/mouse_donor.table.json b/schema/src/main/jade-tables/mouse_donor.table.json
new file mode 100644
index 00000000..6b42f1cc
--- /dev/null
+++ b/schema/src/main/jade-tables/mouse_donor.table.json
@@ -0,0 +1,66 @@
+{
+  "name": "mouse_donor",
+  "columns": [
+    {
+      "name": "id",
+      "datatype": "string",
+      "type": "primary_key"
+    },
+    {
+      "name": "cross_references",
+      "datatype": "string",
+      "type": "repeated"
+    },
+    {
+      "name": "time_created",
+      "datatype": "timestamp",
+      "type": "required"
+    },
+    {
+      "name": "organism",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "award",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "lab",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "submitted_by",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "source",
+      "datatype": "string"
+    },
+    {
+      "name": "genetic_modifications",
+      "datatype": "string",
+      "type": "repeated"
+    },
+    {
+      "name": "genotype",
+      "datatype": "string"
+    },
+    {
+      "name": "strain_background",
+      "datatype": "string"
+    },
+    {
+      "name": "strain_name",
+      "datatype": "string"
+    },
+    {
+      "name": "parent_strains",
+      "datatype": "string",
+      "type": "repeated"
+    }
+  ]
+}
diff --git a/schema/src/main/jade-tables/worm_donor.table.json b/schema/src/main/jade-tables/worm_donor.table.json
new file mode 100644
index 00000000..adbe541c
--- /dev/null
+++ b/schema/src/main/jade-tables/worm_donor.table.json
@@ -0,0 +1,82 @@
+{
+  "name": "worm_donor",
+  "columns": [
+    {
+      "name": "id",
+      "datatype": "string",
+      "type": "primary_key"
+    },
+    {
+      "name": "cross_references",
+      "datatype": "string",
+      "type": "repeated"
+    },
+    {
+      "name": "time_created",
+      "datatype": "timestamp",
+      "type": "required"
+    },
+    {
+      "name": "organism",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "award",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "lab",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "submitted_by",
+      "datatype": "string",
+      "type": "required"
+    },
+    {
+      "name": "source",
+      "datatype": "string"
+    },
+    {
+      "name": "genetic_modifications",
+      "datatype": "string",
+      "type": "repeated"
+    },
+    {
+      "name": "genotype",
+      "datatype": "string"
+    },
+    {
+      "name": "num_times_outcrossed",
+      "datatype": "integer"
+    },
+    {
+      "name": "outcrossed_strain",
+      "datatype": "string"
+    },
+    {
+      "name": "strain_background",
+      "datatype": "string"
+    },
+    {
+      "name": "strain_name",
+      "datatype": "string"
+    },
+    {
+      "name": "parent_strains",
+      "datatype": "string",
+      "type": "repeated"
+    },
+    {
+      "name": "url",
+      "datatype": "string"
+    },
+    {
+      "name": "uuid",
+      "datatype": "string"
+    }
+  ]
+}

From be6d831ba1b96907fbda71913820d780eac633cc Mon Sep 17 00:00:00 2001
From: Drew Herbst <aherbst@broadinstitute.org>
Date: Wed, 13 Oct 2021 09:46:04 -0400
Subject: [PATCH 2/2] Add boilerplate transformation code for new fields

---
 .../BiosampleTransformations.scala               | 16 +++++++++++++++-
 .../transformation/LibraryTransformations.scala  | 11 ++++++++++-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/BiosampleTransformations.scala b/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/BiosampleTransformations.scala
index 01a8f80a..db7f6361 100644
--- a/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/BiosampleTransformations.scala
+++ b/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/BiosampleTransformations.scala
@@ -68,7 +68,21 @@ object BiosampleTransformations {
         None
       } else {
         lotIds.headOption
-      }
+      },
+      /** TODO Implement once schema is frozen */
+      flyLifeStage = Some("ignore"),
+      flySynchronizationStage = Some("ignore"),
+      modelOrganismAge = Some(1L),
+      modelOrganismAgeUnit = Some("ignore"),
+      mouseLifeStage = Some("ignore"),
+      originBatch = Some("ignore"),
+      passageNumber = Some(1L),
+      postNucleicAcidDeliveryTime = Some(1L),
+      postNucleicAcidDeliveryTimeUnit = Some("ignore"),
+      pulseChaseTime = Some(1L),
+      pulseChaseTimeUnit = Some("ignore"),
+      wormLifeStage = Some("ignore"),
+      wormSynchronizationStage = Some("ignore")
     )
   }
 }
diff --git a/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/LibraryTransformations.scala b/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/LibraryTransformations.scala
index 61e35453..26c773e8 100644
--- a/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/LibraryTransformations.scala
+++ b/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/LibraryTransformations.scala
@@ -35,7 +35,16 @@ object LibraryTransformations {
       spikeIns = libraryInput.read[List[String]]("spikeins_used"),
       biosampleId = CommonTransformations.transformId(libraryInput.read[String]("biosample")),
       prepMaterial = libraryInput.tryRead[String]("nucleic_acid_term_id"),
-      prepMaterialName = libraryInput.tryRead[String]("nucleic_acid_term_name")
+      prepMaterialName = libraryInput.tryRead[String]("nucleic_acid_term_name"),
+      /** TODO Implement once schema is frozen */
+      constructionMethod = Some("ignore"),
+      fragmentLengthCv = Some(1L),
+      fragmentLengthSd = Some(1L),
+      fragmentationDurationTime = Some(1L),
+      fragmentationDurationTimeUnit = Some("ignore"),
+      linkers = List(),
+      mintMixtureIdentifier = Some("ignore"),
+      replicates = Some("ignore")
     )
   }