From 72f82f27612842ae455c1bceb67d919d7c544834 Mon Sep 17 00:00:00 2001 From: ollietulloch Date: Fri, 6 Dec 2024 09:57:08 +0000 Subject: [PATCH 1/3] Capture metadata from VCF header --- CHANGELOG.md | 1 + docs/vcf-file-metadata.md | 46 +++++++++++++++++++++ docs/xml-file-metadata.md | 2 +- docs/yaml-mapping-user-guide.md | 3 +- lib/ndr_import/file/vcf.rb | 25 +++++++++++ lib/ndr_import/universal_importer_helper.rb | 23 ++++++----- lib/ndr_import/vcf/table.rb | 2 +- test/file/vcf_test.rb | 16 +++++++ test/vcf/table_test.rb | 2 +- 9 files changed, 106 insertions(+), 14 deletions(-) create mode 100644 docs/vcf-file-metadata.md diff --git a/CHANGELOG.md b/CHANGELOG.md index dcc5546..3ac54f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ * Column zipping functionality * * Capturing Column name * * Regular expression column names * +* VCF file/table metadata storage * ## 11.2.1 / 2024-11-18 ### Fixed diff --git a/docs/vcf-file-metadata.md b/docs/vcf-file-metadata.md new file mode 100644 index 0000000..c543804 --- /dev/null +++ b/docs/vcf-file-metadata.md @@ -0,0 +1,46 @@ +--- +layout: page +title: VCF File Metdata +permalink: /vcf-file-metadata/ +--- + +### Introduction +VCF files contain a header storing metadata, `NdrImport::Vcf::Table` now supports retrieval and storage of that data. + +### `vcf_file_metadata` +* `NdrImport::Vcf::Table` can optionally store `vcf_file_metadata`. This is a hash of { attribute name => regular expression }. +* The `NdrImport::File::Vcf` handler uses `vcf_file_metadata` to locate the metadata from within the file, then sets the `file_metadata` attribute as a hash of { attribute name => regular expression first captured group }. +* The `UniversalImporterHelper` then assigns the handler.file_metadata to the `NdrImport::Table` attribute `table_metadata`, which can then be accessed downstream. 
+ + +### Example: +Given the below example data: + +``` + ##contig= + ##contig= + ##contig= + ##contig= + ##contig= + ##fileDate=2023-03-29 + ##reference=file:///data/humanGenome/hs37d5.fa + ##source=Platypus_Version_0.8.1 + #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 +1 26387783 . G A 847.77 PASS AC=1;AF=0.500;AN=2;DP=85;set=Intersection GT:AD:DP:GQ:PL:SAC 0/1:52,32:84:99:876,0,1277:21,31,14,18 +``` + +The `NdrImport::Vcf::Table` mapping might look like: + +``` +- !ruby/object:NdrImport::Vcf::Table + filename_pattern: !ruby/regexp // + vcf_file_metadata: + genome_build: !ruby/regexp /##reference=file:\/\/\/data\/humanGenome\/(.+)\Z/ + columns: + ... +``` + +This would result in a `table_metadata` value of: +``` +{ 'genome_build' => 'hs37d5.fa' } +``` diff --git a/docs/xml-file-metadata.md b/docs/xml-file-metadata.md index 27e66e3..0207f46 100644 --- a/docs/xml-file-metadata.md +++ b/docs/xml-file-metadata.md @@ -1,6 +1,6 @@ --- layout: page -title: XML File Netdata +title: XML File Metadata permalink: /xml-file-metadata/ --- diff --git a/docs/yaml-mapping-user-guide.md b/docs/yaml-mapping-user-guide.md index 86b76ae..22539e5 100644 --- a/docs/yaml-mapping-user-guide.md +++ b/docs/yaml-mapping-user-guide.md @@ -15,4 +15,5 @@ add_to_nav: true 8. [XML mappings](xml-mappings.md) 9. [Zipped Field Mapping](zipped-field-mapping.md) 10. [Regular Expression Column Names](regexp-column-names.md) -11. [Capturing Column Names in Mapped Data](capturing-column-names.md) \ No newline at end of file +11. [Capturing Column Names in Mapped Data](capturing-column-names.md) +12. [VCF file metadata](vcf-file-metadata.md) \ No newline at end of file diff --git a/lib/ndr_import/file/vcf.rb b/lib/ndr_import/file/vcf.rb index 1f9c94d..363187e 100644 --- a/lib/ndr_import/file/vcf.rb +++ b/lib/ndr_import/file/vcf.rb @@ -8,8 +8,33 @@ module NdrImport module File # This class is a vcf file handler that returns a single table. 
class Vcf < Base + attr_accessor :vcf_file_metadata + + def initialize(*) + super + + @vcf_file_metadata = @options['vcf_file_metadata'] + assign_file_metadata + end + private + def assign_file_metadata + return unless vcf_file_metadata.is_a?(Hash) + + file_metadata_hash = {} + + ::File.read(@filename).each_line do |line| + next unless line =~ /^##/ + + vcf_file_metadata.each do |attribute, pattern| + file_metadata_hash[attribute] = line.match(pattern)[1].presence if line =~ pattern + end + end + + self.file_metadata = file_metadata_hash + end + def rows(&block) return enum_for(:rows) unless block diff --git a/lib/ndr_import/universal_importer_helper.rb b/lib/ndr_import/universal_importer_helper.rb index 7d4ad77..f24a225 100644 --- a/lib/ndr_import/universal_importer_helper.rb +++ b/lib/ndr_import/universal_importer_helper.rb @@ -51,22 +51,25 @@ def extract(source_file, &block) NdrImport::File::Registry.files(source_file, 'unzip_path' => unzip_path).each do |filename| # now at the individual file level, can we find the table mapping? 
table_mapping = get_table_mapping(filename, nil) - - options = { 'unzip_path' => unzip_path, - 'col_sep' => table_mapping.try(:delimiter), - 'file_password' => table_mapping.try(:file_password), - 'liberal_parsing' => table_mapping.try(:liberal_parsing), - 'xml_record_xpath' => table_mapping.try(:xml_record_xpath), - 'slurp' => table_mapping.try(:slurp), - 'yield_xml_record' => table_mapping.try(:yield_xml_record), - 'pattern_match_record_xpath' => table_mapping.try(:pattern_match_record_xpath), - 'xml_file_metadata' => table_mapping.try(:xml_file_metadata) } + options = table_options_from(table_mapping).merge('unzip_path' => unzip_path) tables = NdrImport::File::Registry.tables(filename, table_mapping.try(:format), options) yield_tables_and_their_content(filename, tables, &block) end end + def table_options_from(table_mapping) + { 'col_sep' => table_mapping.try(:delimiter), + 'file_password' => table_mapping.try(:file_password), + 'liberal_parsing' => table_mapping.try(:liberal_parsing), + 'xml_record_xpath' => table_mapping.try(:xml_record_xpath), + 'slurp' => table_mapping.try(:slurp), + 'yield_xml_record' => table_mapping.try(:yield_xml_record), + 'pattern_match_record_xpath' => table_mapping.try(:pattern_match_record_xpath), + 'xml_file_metadata' => table_mapping.try(:xml_file_metadata), + 'vcf_file_metadata' => table_mapping.try(:vcf_file_metadata) } + end + # This method does the table row yielding for the extract method, setting the notifier # so that we can monitor progress def yield_tables_and_their_content(filename, tables, &block) diff --git a/lib/ndr_import/vcf/table.rb b/lib/ndr_import/vcf/table.rb index 1086d06..4359bf7 100644 --- a/lib/ndr_import/vcf/table.rb +++ b/lib/ndr_import/vcf/table.rb @@ -6,7 +6,7 @@ module Vcf # All other Table logic is inherited from `NdrImport::Table` class Table < ::NdrImport::Table def self.all_valid_options - super - %w[delimiter header_lines footer_lines] + super - %w[delimiter header_lines footer_lines] + 
%w[vcf_file_metadata] end def header_lines diff --git a/test/file/vcf_test.rb b/test/file/vcf_test.rb index 565941d..2d88c1c 100644 --- a/test/file/vcf_test.rb +++ b/test/file/vcf_test.rb @@ -17,6 +17,22 @@ def setup assert(rows.all? { |row| row.is_a? Array }) assert_equal 7, rows.to_a.length end + + test 'should read vcf file metadata' do + vcf_file_mapping_metadata = { + 'genome_build' => %r{##reference=file.*?/humanGenome/(.+)}, + 'platypus_version' => /##source=Platypus_Version_([\d.]+)/ + } + options = { 'vcf_file_metadata' => vcf_file_mapping_metadata } + handler = NdrImport::File::Vcf.new(@file_path, nil, options) + + assert_equal vcf_file_mapping_metadata, handler.vcf_file_metadata + + expected_metadata = { 'genome_build' => 'hs37d5.fa', 'platypus_version' => '0.8.1' } + assert_equal expected_metadata, handler.file_metadata + tables = handler.send(:tables).to_a + assert_equal expected_metadata, tables.first.last + end end end end diff --git a/test/vcf/table_test.rb b/test/vcf/table_test.rb index 6b0aac8..5ba78a7 100644 --- a/test/vcf/table_test.rb +++ b/test/vcf/table_test.rb @@ -13,7 +13,7 @@ def setup test 'test_all_valid_options' do valid_options = %w[canonical_name columns file_password filename_pattern format klass last_data_column liberal_parsing row_identifier - significant_mapped_fields slurp tablename_pattern] + significant_mapped_fields slurp tablename_pattern vcf_file_metadata] assert_equal valid_options.sort, NdrImport::Vcf::Table.all_valid_options.sort end From a89367cbf473dedafc6068734fd7254924afd75c Mon Sep 17 00:00:00 2001 From: ollietulloch Date: Wed, 11 Dec 2024 16:43:18 +0000 Subject: [PATCH 2/3] Use `match?` throughout --- lib/ndr_import/file/vcf.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/ndr_import/file/vcf.rb b/lib/ndr_import/file/vcf.rb index 363187e..1707c40 100644 --- a/lib/ndr_import/file/vcf.rb +++ b/lib/ndr_import/file/vcf.rb @@ -25,10 +25,10 @@ def assign_file_metadata file_metadata_hash = {} 
::File.read(@filename).each_line do |line| - next unless line =~ /^##/ + next unless line.match?(/^##/) vcf_file_metadata.each do |attribute, pattern| - file_metadata_hash[attribute] = line.match(pattern)[1].presence if line =~ pattern + file_metadata_hash[attribute] = line.match(pattern)[1].presence if line.match? pattern end end @@ -39,7 +39,7 @@ def rows(&block) return enum_for(:rows) unless block ::File.read(@filename).each_line do |line| - next if line =~ /^##/ + next if line.match?(/^##/) yield BioVcf::VcfLine.parse(line) end From fdac214138159683533c598cd789ef55ec690425 Mon Sep 17 00:00:00 2001 From: ollietulloch Date: Thu, 6 Feb 2025 16:53:03 +0000 Subject: [PATCH 3/3] Documentation typo --- docs/vcf-file-metadata.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/vcf-file-metadata.md b/docs/vcf-file-metadata.md index c543804..120bc61 100644 --- a/docs/vcf-file-metadata.md +++ b/docs/vcf-file-metadata.md @@ -1,6 +1,6 @@ --- layout: page -title: VCF File Metdata +title: VCF File Metadata permalink: /vcf-file-metadata/ ---