From 2a351b65142508b892049be9fb994447915781d7 Mon Sep 17 00:00:00 2001 From: Drew Herbst Date: Fri, 8 Oct 2021 09:15:21 -0400 Subject: [PATCH 1/2] Biosample updates Worm donor Add new donor tables Update transformations Update transformations Remove unneeded fields Update documentation (#74) Update README.md Remove test files New fields Revert changes --- .gitignore | 1 + ARCHITECTURE.md | 41 ++++++++++ README.md | 71 +++++++--------- .../src/main/jade-tables/biosample.table.json | 52 ++++++++++++ .../src/main/jade-tables/fly_donor.table.json | 66 +++++++++++++++ .../src/main/jade-tables/library.table.json | 33 ++++++++ .../main/jade-tables/manatee_donor.table.json | 65 +++++++++++++++ .../main/jade-tables/mouse_donor.table.json | 66 +++++++++++++++ .../main/jade-tables/worm_donor.table.json | 82 +++++++++++++++++++ 9 files changed, 436 insertions(+), 41 deletions(-) create mode 100644 ARCHITECTURE.md create mode 100644 schema/src/main/jade-tables/fly_donor.table.json create mode 100644 schema/src/main/jade-tables/manatee_donor.table.json create mode 100644 schema/src/main/jade-tables/mouse_donor.table.json create mode 100644 schema/src/main/jade-tables/worm_donor.table.json diff --git a/.gitignore b/.gitignore index 8a121f3f..4d37ed89 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ target/ project/metals.sbt Chart.lock charts/ +.DS_Store diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 00000000..83b45a2d --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,41 @@ +# ENCODE Ingest Architecture + +## Schema Design +The schema used for this dataset was almost entirely inspired by the Terra +Interoperability Model (TIM). JSON definitions of the resulting tables can be +found under [`schema/`](./schema). 
The schema is purposefully very different +from the source ENCODE schema: it is sample-centric (instead of experiment-centric), +merges & splits ENCODE entities as needed, and does not attempt to capture all +of the information published in the source data. +![Data model](./images/data-model.jpg) + +## Pipeline Architecture +The ingest pipeline is orchestrated through Argo, with most data processing logic +delegated to Dataflow and BigQuery. The high level flow looks like: +![Architecture diagram](./images/encode-orch.png) + +NOTE: Only metadata ingest is implemented. The system still needs to be extended +to ingest data files from ENCODE's S3 archives. + +### Extracting Metadata +Metadata extraction is the first step of ENCODE ingest. The extraction component is +written as a Dataflow pipeline, which begins by querying the metadata of all Biosample +entities from human donors. From there, the pipeline traverses known foreign-key +fields to scrape the rest of the metadata we care about: +![Extraction graph](./images/encode-extract.png) + +Some important things to note: +1. The initial query relies on there being relatively few Biosample entities in ENCODE's + system, as they all must be returned in a single response payload. Performance may + degrade as ENCODE continues to grow. +2. The links we follow between different entity types were chosen based on non-scientific + experimentation and performance-testing. If the number of entities in a type grows + significantly, performance might degrade / the pipeline might crash. + +### Transforming Metadata +Metadata transformation is also implemented as a Dataflow pipeline. Instead of one large +DAG, this step can be viewed as a collection of tiny processing trees that just happen to +run at the same time: +![Transformation trees](./images/encode-transform.png) + +Note that some raw entities are used to contribute to multiple output tables. 
diff --git a/README.md b/README.md index ecefac92..1d798ee8 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,31 @@ # ENCODE Ingest -Batch ETL pipeline to mirror ENCODE data into the Terra Data Repository (TDR). - -## Schema Design -The schema used for this dataset was almost entirely inspired by the Terra -Interoperability Model (TIM). JSON definitions of the resulting tables can be -found under [`schema/`](./schema). The schema is purposefully very different -from the source ENCODE schema: it is sample-centric (instead of experiment-centric), -merges & splits ENCODE entities as needed, and does not attempt to capture all -of the information published in the source data. -![Data model](./images/data-model.jpg) - -## Pipeline Architecture -The ingest pipeline is orchestrated through Argo, with most data processing logic -delegated to Dataflow and BigQuery. The high level flow looks like: -![Architecture diagram](./images/encode-orch.png) - -NOTE: Only metadata ingest is implemented. The system still needs to be extended -to ingest data files from ENCODE's S3 archives. - -### Extracting Metadata -Metadata extraction is the first step of ENCODE ingest. The extraction component is -written as a Dataflow pipeline, which begins by querying the metadata of all Biosample -entities from human donors. From there, the pipeline traverses known foreign-key -fields to scrape the rest of the metadata we care about: -![Extraction graph](./images/encode-extract.png) - -Some important things to note: -1. The initial query relies on there being relatively few Biosample entities in ENCODE's - system, as they all must be returned in a single response payload. Performance may - degrade as ENCODE continues to grow. -2. The links we follow between different entity types were chosen based on non-scientific - experimentation and performance-testing. If the number of entities in a type grows - significantly, performance migth degrade / the pipeline might crash. 
- -### Transforming Metadata -Metadata transformation is also implemented as a Dataflow pipeline. Instead of one large -DAG, this step can be viewed as a collection of tiny processing trees that just happen to -run at the same time: -![Transformation trees](./images/encode-transform.png) - -Note that some raw entities are used to contribute to multiple output tables. + +Batch ETL pipeline to mirror ENCODE data into the Terra Data Repository (TDR). See the [architecture documentation](https://github.com/DataBiosphere/encode-ingest/blob/master/ARCHITECTURE.md) for +further design details. + +## Getting Started + +Orchestration of the ETL flows in this project is implemented using [Argo Workflows](https://argoproj.github.io/argo-workflows/). + +The core extraction and transformation data pipelines are implemented in [Scio](https://spotify.github.io/scio/) on top of Apache Beam. + +After cloning the repository, ensure you can compile the code, auto-generate schema classes +and run the test suite from the repository root: + +`sbt test` + +## Development Process + +All development should be done on branches off of the protected `master` branch. 
After review, merge to `master` +and then follow the instructions in the [monster-deploy repo](https://github.com/broadinstitute/monster-deploy/) + +When modifying the Scio data pipelines, it's possible to run the pipeline locally by invoking the relevant pipeline: + +* **Extraction:** + +`sbt "encode-extraction / runMain org.broadinstitute.monster.encode.extraction.ExtractionPipeline --outputDir="` +* **Transformation:** + +`sbt "encode-transformation-pipeline / runMain org.broadinstitute.monster.encode.transformation.TransformationPipeline --inputPrefix= --outputPrefix="` + +Development of Argo changes requires deployment to the DEV environment as documented in the [monster-deploy repo](https://github.com/broadinstitute/monster-deploy/) diff --git a/schema/src/main/jade-tables/biosample.table.json b/schema/src/main/jade-tables/biosample.table.json index 2c63370b..33a74899 100644 --- a/schema/src/main/jade-tables/biosample.table.json +++ b/schema/src/main/jade-tables/biosample.table.json @@ -114,6 +114,58 @@ { "name": "lot_id", "datatype": "string" + }, + { + "name": "fly_life_stage", + "datatype": "string" + }, + { + "name": "fly_synchronization_stage", + "datatype": "string" + }, + { + "name": "model_organism_age", + "datatype": "integer" + }, + { + "name": "model_organism_age_unit", + "datatype": "string" + }, + { + "name": "mouse_life_stage", + "datatype": "string" + }, + { + "name": "origin_batch", + "datatype": "string" + }, + { + "name": "passage_number", + "datatype": "integer" + }, + { + "name": "post_nucleic_acid_delivery_time", + "datatype": "integer" + }, + { + "name": "post_nucleic_acid_delivery_time_unit", + "datatype": "string" + }, + { + "name": "pulse_chase_time", + "datatype": "integer" + }, + { + "name": "pulse_chase_time_unit", + "datatype": "string" + }, + { + "name": "worm_life_stage", + "datatype": "string" + }, + { + "name": "worm_synchronization_stage", + "datatype": "string" } ] } diff --git 
a/schema/src/main/jade-tables/fly_donor.table.json b/schema/src/main/jade-tables/fly_donor.table.json new file mode 100644 index 00000000..abe2d24b --- /dev/null +++ b/schema/src/main/jade-tables/fly_donor.table.json @@ -0,0 +1,66 @@ +{ + "name": "fly_donor", + "columns": [ + { + "name": "id", + "datatype": "string", + "type": "primary_key" + }, + { + "name": "cross_references", + "datatype": "string", + "type": "repeated" + }, + { + "name": "time_created", + "datatype": "timestamp", + "type": "required" + }, + { + "name": "organism", + "datatype": "string", + "type": "required" + }, + { + "name": "award", + "datatype": "string", + "type": "required" + }, + { + "name": "lab", + "datatype": "string", + "type": "required" + }, + { + "name": "submitted_by", + "datatype": "string", + "type": "required" + }, + { + "name": "source", + "datatype": "string" + }, + { + "name": "genetic_modifications", + "datatype": "string", + "type": "repeated" + }, + { + "name": "genotype", + "datatype": "string" + }, + { + "name": "strain_background", + "datatype": "string" + }, + { + "name": "strain_name", + "datatype": "string" + }, + { + "name": "parent_strains", + "datatype": "string", + "type": "repeated" + } + ] +} diff --git a/schema/src/main/jade-tables/library.table.json b/schema/src/main/jade-tables/library.table.json index 3ab1731d..07d1c960 100644 --- a/schema/src/main/jade-tables/library.table.json +++ b/schema/src/main/jade-tables/library.table.json @@ -80,6 +80,39 @@ { "name": "prep_material_name", "datatype": "string" + }, + { + "name": "construction_method", + "datatype": "string" + }, + { + "name": "fragment_length_cv", + "datatype": "integer" + }, + { + "name": "fragment_length_sd", + "datatype": "integer" + }, + { + "name": "fragmentation_duration_time", + "datatype": "integer" + }, + { + "name": "fragmentation_duration_time_unit", + "datatype": "string" + }, + { + "name": "linkers", + "datatype": "string", + "type": "repeated" + }, + { + "name": 
"mint_mixture_identifier", + "datatype": "string" + }, + { + "name": "replicates", + "datatype": "string" } ] } diff --git a/schema/src/main/jade-tables/manatee_donor.table.json b/schema/src/main/jade-tables/manatee_donor.table.json new file mode 100644 index 00000000..558be6a8 --- /dev/null +++ b/schema/src/main/jade-tables/manatee_donor.table.json @@ -0,0 +1,65 @@ +{ + "name": "manatee_donor", + "columns": [ + { + "name": "id", + "datatype": "string", + "type": "primary_key" + }, + { + "name": "cross_references", + "datatype": "string", + "type": "repeated" + }, + { + "name": "time_created", + "datatype": "timestamp", + "type": "required" + }, + { + "name": "age_min", + "datatype": "integer" + }, + { + "name": "age_max", + "datatype": "integer" + }, + { + "name": "age_unit", + "datatype": "string" + }, + { + "name": "organism", + "datatype": "string", + "type": "required" + }, + { + "name": "award", + "datatype": "string", + "type": "required" + }, + { + "name": "lab", + "datatype": "string", + "type": "required" + }, + { + "name": "life_stage", + "datatype": "string" + }, + { + "name": "submitted_by", + "datatype": "string", + "type": "required" + }, + { + "name": "source", + "datatype": "string" + }, + { + "name": "genetic_modifications", + "datatype": "string", + "type": "repeated" + } + ] +} diff --git a/schema/src/main/jade-tables/mouse_donor.table.json b/schema/src/main/jade-tables/mouse_donor.table.json new file mode 100644 index 00000000..6b42f1cc --- /dev/null +++ b/schema/src/main/jade-tables/mouse_donor.table.json @@ -0,0 +1,66 @@ +{ + "name": "mouse_donor", + "columns": [ + { + "name": "id", + "datatype": "string", + "type": "primary_key" + }, + { + "name": "cross_references", + "datatype": "string", + "type": "repeated" + }, + { + "name": "time_created", + "datatype": "timestamp", + "type": "required" + }, + { + "name": "organism", + "datatype": "string", + "type": "required" + }, + { + "name": "award", + "datatype": "string", + "type": "required" + 
}, + { + "name": "lab", + "datatype": "string", + "type": "required" + }, + { + "name": "submitted_by", + "datatype": "string", + "type": "required" + }, + { + "name": "source", + "datatype": "string" + }, + { + "name": "genetic_modifications", + "datatype": "string", + "type": "repeated" + }, + { + "name": "genotype", + "datatype": "string" + }, + { + "name": "strain_background", + "datatype": "string" + }, + { + "name": "strain_name", + "datatype": "string" + }, + { + "name": "parent_strains", + "datatype": "string", + "type": "repeated" + } + ] +} diff --git a/schema/src/main/jade-tables/worm_donor.table.json b/schema/src/main/jade-tables/worm_donor.table.json new file mode 100644 index 00000000..adbe541c --- /dev/null +++ b/schema/src/main/jade-tables/worm_donor.table.json @@ -0,0 +1,82 @@ +{ + "name": "worm_donor", + "columns": [ + { + "name": "id", + "datatype": "string", + "type": "primary_key" + }, + { + "name": "cross_references", + "datatype": "string", + "type": "repeated" + }, + { + "name": "time_created", + "datatype": "timestamp", + "type": "required" + }, + { + "name": "organism", + "datatype": "string", + "type": "required" + }, + { + "name": "award", + "datatype": "string", + "type": "required" + }, + { + "name": "lab", + "datatype": "string", + "type": "required" + }, + { + "name": "submitted_by", + "datatype": "string", + "type": "required" + }, + { + "name": "source", + "datatype": "string" + }, + { + "name": "genetic_modifications", + "datatype": "string", + "type": "repeated" + }, + { + "name": "genotype", + "datatype": "string" + }, + { + "name": "num_times_outcrossed", + "datatype": "integer" + }, + { + "name": "outcrossed_strain", + "datatype": "string" + }, + { + "name": "strain_background", + "datatype": "string" + }, + { + "name": "strain_name", + "datatype": "string" + }, + { + "name": "parent_strains", + "datatype": "string", + "type": "repeated" + }, + { + "name": "url", + "datatype": "string" + }, + { + "name": "uuid", + "datatype": 
"string" + } + ] +} From be6d831ba1b96907fbda71913820d780eac633cc Mon Sep 17 00:00:00 2001 From: Drew Herbst Date: Wed, 13 Oct 2021 09:46:04 -0400 Subject: [PATCH 2/2] Add boilerplate transformation code for new fields --- .../BiosampleTransformations.scala | 16 +++++++++++++++- .../transformation/LibraryTransformations.scala | 11 ++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/BiosampleTransformations.scala b/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/BiosampleTransformations.scala index 01a8f80a..db7f6361 100644 --- a/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/BiosampleTransformations.scala +++ b/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/BiosampleTransformations.scala @@ -68,7 +68,21 @@ object BiosampleTransformations { None } else { lotIds.headOption - } + }, + /** TODO Implement once schema is frozen */ + flyLifeStage = Some("ignore"), + flySynchronizationStage = Some("ignore"), + modelOrganismAge = Some(1L), + modelOrganismAgeUnit = Some("ignore"), + mouseLifeStage = Some("ignore"), + originBatch = Some("ignore"), + passageNumber = Some(1L), + postNucleicAcidDeliveryTime = Some(1L), + postNucleicAcidDeliveryTimeUnit = Some("ignore"), + pulseChaseTime = Some(1L), + pulseChaseTimeUnit = Some("ignore"), + wormLifeStage = Some("ignore"), + wormSynchronizationStage = Some("ignore") ) } } diff --git a/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/LibraryTransformations.scala b/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/LibraryTransformations.scala index 61e35453..26c773e8 100644 --- a/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/LibraryTransformations.scala +++ 
b/transformation/src/main/scala/org/broadinstitute/monster/encode/transformation/LibraryTransformations.scala @@ -35,7 +35,16 @@ object LibraryTransformations { spikeIns = libraryInput.read[List[String]]("spikeins_used"), biosampleId = CommonTransformations.transformId(libraryInput.read[String]("biosample")), prepMaterial = libraryInput.tryRead[String]("nucleic_acid_term_id"), - prepMaterialName = libraryInput.tryRead[String]("nucleic_acid_term_name") + prepMaterialName = libraryInput.tryRead[String]("nucleic_acid_term_name"), + /** TODO Implement once schema is frozen */ + constructionMethod = Some("ignore"), + fragmentLengthCv = Some(1L), + fragmentLengthSd = Some(1L), + fragmentationDurationTime = Some(1L), + fragmentationDurationTimeUnit = Some("ignore"), + linkers = List(), + mintMixtureIdentifier = Some("ignore"), + replicates = Some("ignore") ) }