From 5d6a2a26fb2cc33aabd1c68969e2fce4d126ab95 Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Fri, 15 Oct 2021 12:14:43 +0530
Subject: [PATCH 01/29] Update preprocessing_utils.py

---
 bsmetadata/preprocessing_utils.py | 54 ++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/bsmetadata/preprocessing_utils.py b/bsmetadata/preprocessing_utils.py
index 3041388c..004839c1 100644
--- a/bsmetadata/preprocessing_utils.py
+++ b/bsmetadata/preprocessing_utils.py
@@ -14,8 +14,9 @@ This script provides functions for adding different kinds of metadata to a pretraining corpus.
 """
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from typing import Dict, List, Optional
-
+import requests
 
 class MetadataPreprocessor(ABC):
     """A metadata processor can be used for preprocessing text and adding or extracting metadata information."""
@@ -52,3 +53,54 @@ def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
     def _extract_timestamp_from_url(self, url: str) -> Optional:
         # This would have to be implemented.
         return None
+
+
+class WebsiteDescPreprocessor(MetadataPreprocessor):
+    """Metadata preprocessor for adding website description based on URLs."""
+
+    website_description_cache = {}
+    org_list = ["com", "co", "org", "go", "in"]
+
+    def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
+
+        metadata_list = examples["metadata"]
+
+
+        # Iterate through the metadata associated with all examples in this batch.
+        for metadata in metadata_list:
+            # Get the URL associated with this example.
+            urls = [md["value"] for md in metadata if md["key"] == "url"]
+
+            if not urls:
+                continue
+
+            # Try to extract a website description from the given URL and add it to the metadata.
+            website_description = self._extract_website_desc_from_url(urls[0])
+
+            if website_description :
+                metadata.append({"key": "timestamp", "type": "global", "value": website_description})
+
+        return examples
+
+    def _extract_website_desc_from_url(self, url: str) -> Optional:
+
+        domain = url.str.split('/')[2] #e.g http://www.californialandcan.org/Plumas -> www.californialandcan.org
+        keywords = domain.str.split('.')
+
+        keyword = keywords[-2] if len(keywords[-2])>3 else keywords[1] if (keywords[1] not in self.org_list) else keywords[0] #extracting the keyword from domain e.g. www.californialandcan.org -> californialandcan
+
+        if keyword not in self.website_description_cache:
+            self.website_description_cache[keyword] = self.extract_wiki_desc(keyword)
+
+        return self.website_description_cache[keyword]
+
+    def extract_wiki_desc(self, keyword:str) -> Optional:
+
+        keyword = keyword.replace(' ', '_')
+        r = requests.get("https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles="+ keyword + "&exintro=&exsentences=2&explaintext=&redirects=&formatversion=2&format=json")
+        page = r.json()
+
+        try:
+            return page['query']['pages'][0]['extract']
+        except:
+            return None
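
For readers unfamiliar with the MediaWiki "extracts" endpoint queried by extract_wiki_desc above, the same request can be expressed with requests' params argument; a minimal sketch (the title "Rightmove" is only illustrative):

    import requests

    # Same query as in extract_wiki_desc, with the URL parameters passed
    # explicitly instead of being concatenated into one string.
    params = {
        "action": "query",
        "prop": "extracts",
        "titles": "Rightmove",  # illustrative keyword
        "exintro": "",          # lead section only
        "exsentences": 2,       # at most two sentences
        "explaintext": "",      # plain text rather than HTML
        "redirects": "",        # follow redirects
        "formatversion": 2,
        "format": "json",
    }
    page = requests.get("https://en.wikipedia.org/w/api.php", params=params).json()
    # With formatversion=2, "pages" comes back as a list; "extract" holds the intro text.
    description = page["query"]["pages"][0].get("extract")
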
From 80dcd7a7a53d6055f5722caba668eecf66183ca2 Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Fri, 15 Oct 2021 12:22:50 +0530
Subject: [PATCH 02/29] Update preprocessing_utils.py

---
 bsmetadata/preprocessing_utils.py | 37 ++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/bsmetadata/preprocessing_utils.py b/bsmetadata/preprocessing_utils.py
index 004839c1..d8ea3eea 100644
--- a/bsmetadata/preprocessing_utils.py
+++ b/bsmetadata/preprocessing_utils.py
@@ -16,8 +16,10 @@
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from typing import Dict, List, Optional
+
 import requests
 
+
 class MetadataPreprocessor(ABC):
     """A metadata processor can be used for preprocessing text and adding or extracting metadata information."""
 
@@ -64,7 +66,6 @@ class WebsiteDescPreprocessor(MetadataPreprocessor):
     def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
 
         metadata_list = examples["metadata"]
-
 
         # Iterate through the metadata associated with all examples in this batch.
         for metadata in metadata_list:
@@ -77,30 +78,40 @@ def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
             # Try to extract a website description from the given URL and add it to the metadata.
             website_description = self._extract_website_desc_from_url(urls[0])
 
-            if website_description :
+            if website_description:
                 metadata.append({"key": "timestamp", "type": "global", "value": website_description})
 
         return examples
 
     def _extract_website_desc_from_url(self, url: str) -> Optional:
-
-        domain = url.str.split('/')[2] #e.g http://www.californialandcan.org/Plumas -> www.californialandcan.org
-        keywords = domain.str.split('.')
 
-        keyword = keywords[-2] if len(keywords[-2])>3 else keywords[1] if (keywords[1] not in self.org_list) else keywords[0] #extracting the keyword from domain e.g. www.californialandcan.org -> californialandcan
-
-        if keyword not in self.website_description_cache:
+        domain = url.str.split("/")[2]  # e.g http://www.californialandcan.org/Plumas -> www.californialandcan.org
+        keywords = domain.str.split(".")
+
+        keyword = (
+            keywords[-2]
+            if len(keywords[-2]) > 3
+            else keywords[1]
+            if (keywords[1] not in self.org_list)
+            else keywords[0]
+        )  # extracting the keyword from domain e.g. www.californialandcan.org -> californialandcan
+
+        if keyword not in self.website_description_cache:
             self.website_description_cache[keyword] = self.extract_wiki_desc(keyword)
 
         return self.website_description_cache[keyword]
 
-    def extract_wiki_desc(self, keyword:str) -> Optional:
-
-        keyword = keyword.replace(' ', '_')
-        r = requests.get("https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles="+ keyword + "&exintro=&exsentences=2&explaintext=&redirects=&formatversion=2&format=json")
+    def extract_wiki_desc(self, keyword: str) -> Optional:
+
+        keyword = keyword.replace(" ", "_")
+        r = requests.get(
+            "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles="
+            + keyword
+            + "&exintro=&exsentences=2&explaintext=&redirects=&formatversion=2&format=json"
+        )
         page = r.json()
 
         try:
-            return page['query']['pages'][0]['extract']
+            return page["query"]["pages"][0]["extract"]
         except:
            return None
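
A quick trace of the keyword heuristic above (note that `url.str.split(...)` only exists on a pandas Series; on a plain string the calls would be `url.split(...)`, so this hypothetical trace assumes plain-string splitting):

    url = "http://www.californialandcan.org/Plumas"
    domain = url.split("/")[2]      # "www.californialandcan.org"
    keywords = domain.split(".")    # ["www", "californialandcan", "org"]
    # len("californialandcan") > 3, so the first branch of the ternary wins:
    keyword = keywords[-2]          # "californialandcan"

    url = "http://bbc.co.uk/news"
    keywords = url.split("/")[2].split(".")  # ["bbc", "co", "uk"]
    # len("co") <= 3 and "co" is in org_list, so keywords[0] is picked:
    keyword = keywords[0]           # "bbc"
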
From 7aa17975be389a8807769ec4a9e48c954ae05489 Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Mon, 1 Nov 2021 19:01:08 +0530
Subject: [PATCH 03/29] adding processor for website metadata

---
 .../preprocessing_tools/website_desc_utils.py | 39 +++++++++++++++
 bsmetadata/preprocessing_utils.py             | 49 +++++--------------
 requirements.txt                              |  1 +
 3 files changed, 52 insertions(+), 37 deletions(-)
 create mode 100644 bsmetadata/preprocessing_tools/website_desc_utils.py

diff --git a/bsmetadata/preprocessing_tools/website_desc_utils.py b/bsmetadata/preprocessing_tools/website_desc_utils.py
new file mode 100644
index 00000000..5703475b
--- /dev/null
+++ b/bsmetadata/preprocessing_tools/website_desc_utils.py
@@ -0,0 +1,39 @@
+from collections import defaultdict
+from typing import Optional
+
+from wikipedia2vec.dump_db import DumpDB
+from urllib.parse import urlsplit
+
+
+class WebsiteDescUtils:
+    def __init__(self, path_wiki_db) -> None:
+        self.cache = defaultdict(str)
+        self.wiki_dump_db = DumpDB(path_wiki_db)
+        self.redirects_map = {key.lower(): value for key, value in self.wiki_dump_db.redirects()} #loading all redirect information: takes ~10s
+    def fetch_wikipedia_title_from_keyword(self, keyword: str) -> str:
+        title = self.redirects_map.get(
+            keyword, keyword.split(".")[0].capitalize()
+        )  # fallback to default for cases where domain is not recognized. We'll try to hit the db with the exact keyword directly (e.g. rightmove.com -> Rightmove) Capitalizing since wikipedia titles are so
+        return title
+
+    def fetch_wikipedia_description_for_title(self, title: str) -> Optional:
+        try:
+            text = self.wiki_dump_db.get_paragraphs(title)[0].text
+            text = '. '.join(text.split('. ')[:2])  # Picking the first two sentences from the text (Splitting on '. ' might not give the desired sentence for some corner cases but mostly works)
+            if not text.endswith('.'):
+                text+='.'
+        except Exception:
+            return None
+        return text
+
+    def extract_wiki_desc(self, keyword: str) -> Optional:
+
+        title = self.fetch_wikipedia_title_from_keyword(keyword)
+        desc = self.fetch_wikipedia_description_for_title(title)
+        return desc
+
+    def fetch_website_description_from_keyword(self, keyword: str) -> Optional:
+        if not self.cache[keyword]:
+            self.cache[keyword] = self.extract_wiki_desc(keyword)
+
+        return self.cache[keyword]
diff --git a/bsmetadata/preprocessing_utils.py b/bsmetadata/preprocessing_utils.py
index 0347c843..bef6b62f 100644
--- a/bsmetadata/preprocessing_utils.py
+++ b/bsmetadata/preprocessing_utils.py
@@ -14,10 +14,10 @@ This script provides functions for adding different kinds of metadata to a pretraining corpus.
 """
 from abc import ABC, abstractmethod
-from collections import defaultdict
 from typing import Dict, List, Optional
 from urllib.parse import unquote, urlsplit
 
+from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils
 from bsmetadata.vendor.dateutil.src.dateutil.parser import ParserError, parse
 
 
@@ -37,13 +37,16 @@ def parse_date(path):
         return None
 
 
+def fetch_keyword_from_url(url: str) -> str:  # e.g http://www.californialandcan.org/Plumas -> californialandcan.org
+    domain = urlsplit(url).netloc
+    return domain.replace("www.", "")
+
+
 def remove_improbable_date(x):
     if x is not None and (x.year < 1983 or x.year > 2021):
         return None
     return x
 
-import requests
-
 
 class MetadataPreprocessor(ABC):
     """A metadata processor can be used for preprocessing text and adding or extracting metadata information."""
@@ -85,12 +88,12 @@ def _extract_timestamp_from_url(self, url: str) -> Optional[str]:
 
         return date
 
-
 class WebsiteDescPreprocessor(MetadataPreprocessor):
     """Metadata preprocessor for adding website description based on URLs."""
 
-    website_description_cache = {}
-    org_list = ["com", "co", "org", "go", "in"]
+    def __init__(self, path_wiki_db: str) -> None:
+        self.website_utils = WebsiteDescUtils(path_wiki_db)
+        super().__init__()
 
     def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
 
@@ -108,39 +111,11 @@ def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
             website_description = self._extract_website_desc_from_url(urls[0])
 
             if website_description:
-                metadata.append({"key": "timestamp", "type": "global", "value": website_description})
-
+                metadata.append({"key": "website_description", "type": "global", "value": website_description})
         return examples
 
     def _extract_website_desc_from_url(self, url: str) -> Optional:
 
-        domain = url.str.split("/")[2]  # e.g http://www.californialandcan.org/Plumas -> www.californialandcan.org
-        keywords = domain.str.split(".")
-
-        keyword = (
-            keywords[-2]
-            if len(keywords[-2]) > 3
-            else keywords[1]
-            if (keywords[1] not in self.org_list)
-            else keywords[0]
-        )  # extracting the keyword from domain e.g. www.californialandcan.org -> californialandcan
-
-        if keyword not in self.website_description_cache:
-            self.website_description_cache[keyword] = self.extract_wiki_desc(keyword)
-
-        return self.website_description_cache[keyword]
-
-    def extract_wiki_desc(self, keyword: str) -> Optional:
-
-        keyword = keyword.replace(" ", "_")
-        r = requests.get(
-            "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&titles="
-            + keyword
-            + "&exintro=&exsentences=2&explaintext=&redirects=&formatversion=2&format=json"
-        )
-        page = r.json()
-
-        try:
-            return page["query"]["pages"][0]["extract"]
-        except:
-            return None
+        keyword = fetch_keyword_from_url(url)
+        return self.website_utils.fetch_website_description_from_keyword(keyword)

diff --git a/requirements.txt b/requirements.txt
index 62be06ed..a531d72a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ wandb>=0.10.32,<1 # pip will likely update it to 0.12.1, but it is probably ok
 transformers>=4.6.0,<5 # pip will likely update it to 4.10.0, but it is probably ok and good for bugfixes.
 accelerate>=0.4.0,<1 # We may want to use 0.5.0 in the near future
 datasets[streaming]>=1.11.0,<2
+wikipedia2vec==1.0.5
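
For orientation, a rough usage sketch of the new utility — it assumes a dump DB has already been built with wikipedia2vec's `build-dump-db` command, and the path below is hypothetical:

    from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils

    # Hypothetical path to a DB built via `wikipedia2vec build-dump-db <xml-dump> <out>`.
    utils = WebsiteDescUtils("preprocessing_data/wiki_dump/wiki_en_dump_db")

    # "rightmove.com" is first looked up in the lowercased redirect map;
    # if it is missing, the fallback title is "Rightmove".
    desc = utils.fetch_website_description_from_keyword("rightmove.com")
    print(desc)  # opening sentences of the article's first paragraph, or None
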
From 561867afa25adb0578883250a659461c8c39e6aa Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Mon, 1 Nov 2021 19:01:15 +0530
Subject: [PATCH 04/29] Create download_wiki_dump.sh

---
 bsmetadata/preprocessing_scripts/download_wiki_dump.sh | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 bsmetadata/preprocessing_scripts/download_wiki_dump.sh

diff --git a/bsmetadata/preprocessing_scripts/download_wiki_dump.sh b/bsmetadata/preprocessing_scripts/download_wiki_dump.sh
new file mode 100644
index 00000000..6470cef4
--- /dev/null
+++ b/bsmetadata/preprocessing_scripts/download_wiki_dump.sh
@@ -0,0 +1,4 @@
+
+
+HUB_REPO_NAME= bs-modeling-metadata/wiki_dump
+git clone https://huggingface.co/datasets/${HUB_REPO_NAME}
\ No newline at end of file

From 7a59ebae28008b491461d8bb97c35e4ff34a9693 Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Mon, 1 Nov 2021 19:19:10 +0530
Subject: [PATCH 05/29] run style and quality checks

---
 .../preprocessing_tools/website_desc_utils.py | 17 +++++++++++------
 bsmetadata/preprocessing_utils.py             |  1 -
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/bsmetadata/preprocessing_tools/website_desc_utils.py b/bsmetadata/preprocessing_tools/website_desc_utils.py
index 5703475b..530958b1 100644
--- a/bsmetadata/preprocessing_tools/website_desc_utils.py
+++ b/bsmetadata/preprocessing_tools/website_desc_utils.py
@@ -1,15 +1,18 @@
 from collections import defaultdict
 from typing import Optional
+from urllib.parse import urlsplit
 
 from wikipedia2vec.dump_db import DumpDB
-from urllib.parse import urlsplit
 
 
 class WebsiteDescUtils:
     def __init__(self, path_wiki_db) -> None:
         self.cache = defaultdict(str)
         self.wiki_dump_db = DumpDB(path_wiki_db)
-        self.redirects_map = {key.lower(): value for key, value in self.wiki_dump_db.redirects()} #loading all redirect information: takes ~10s
+        self.redirects_map = {
+            key.lower(): value for key, value in self.wiki_dump_db.redirects()
+        }  # loading all redirect information: takes ~10s
+
     def fetch_wikipedia_title_from_keyword(self, keyword: str) -> str:
         title = self.redirects_map.get(
             keyword, keyword.split(".")[0].capitalize()
@@ -18,10 +21,12 @@ def fetch_wikipedia_title_from_keyword(self, keyword: str) -> str:
 
     def fetch_wikipedia_description_for_title(self, title: str) -> Optional:
         try:
-            text = self.wiki_dump_db.get_paragraphs(title)[0].text
-            text = '. '.join(text.split('. ')[:2])  # Picking the first two sentences from the text (Splitting on '. ' might not give the desired sentence for some corner cases but mostly works)
-            if not text.endswith('.'):
-                text+='.'
+            text = self.wiki_dump_db.get_paragraphs(title)[0].text
+            text = ". ".join(
+                text.split(". ")[:2]
+            )  # Picking the first two sentences from the text (Splitting on '. ' might not give the desired sentence for some corner cases but mostly works)
+            if not text.endswith("."):
+                text += "."
         except Exception:
             return None
         return text
diff --git a/bsmetadata/preprocessing_utils.py b/bsmetadata/preprocessing_utils.py
index bef6b62f..e239b6cd 100644
--- a/bsmetadata/preprocessing_utils.py
+++ b/bsmetadata/preprocessing_utils.py
@@ -118,4 +118,3 @@ def _extract_website_desc_from_url(self, url: str) -> Optional:
 
         keyword = fetch_keyword_from_url(url)
         return self.website_utils.fetch_website_description_from_keyword(keyword)
-

From fce9980f9a36b18cdf09b3f189549271eee6337a Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Mon, 1 Nov 2021 19:28:19 +0530
Subject: [PATCH 06/29] Update website_desc_utils.py

---
 bsmetadata/preprocessing_tools/website_desc_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bsmetadata/preprocessing_tools/website_desc_utils.py b/bsmetadata/preprocessing_tools/website_desc_utils.py
index 530958b1..3f316999 100644
--- a/bsmetadata/preprocessing_tools/website_desc_utils.py
+++ b/bsmetadata/preprocessing_tools/website_desc_utils.py
@@ -1,6 +1,5 @@
 from collections import defaultdict
 from typing import Optional
-from urllib.parse import urlsplit
 
 from wikipedia2vec.dump_db import DumpDB
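
The corner case acknowledged in the comment above is easy to trigger — any abbreviation with a trailing period defeats the naive `'. '` split, which is what motivates the switch to a real sentence tokenizer in the next commit:

    text = "Acme Corp. is a U.S. based company. It was founded in 1999."
    ". ".join(text.split(". ")[:2])
    # -> 'Acme Corp. is a U.S' -- the split fires inside "Corp." and "U.S.",
    # so the "two sentences" are really fragments of the first one.
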
From ab7bd721aa5cfda959eb87166e2dee19c3bd07c6 Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Mon, 1 Nov 2021 21:48:45 +0530
Subject: [PATCH 07/29] adding tokenization for sentence

---
 bsmetadata/preprocessing_scripts/download_wiki_dump.sh | 6 +++++-
 bsmetadata/preprocessing_tools/website_desc_utils.py   | 7 ++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/bsmetadata/preprocessing_scripts/download_wiki_dump.sh b/bsmetadata/preprocessing_scripts/download_wiki_dump.sh
index 6470cef4..73e5db9d 100644
--- a/bsmetadata/preprocessing_scripts/download_wiki_dump.sh
+++ b/bsmetadata/preprocessing_scripts/download_wiki_dump.sh
@@ -1,4 +1,8 @@
 
 
 HUB_REPO_NAME= bs-modeling-metadata/wiki_dump
-git clone https://huggingface.co/datasets/${HUB_REPO_NAME}
\ No newline at end of file
+git clone https://huggingface.co/datasets/${HUB_REPO_NAME}
+
+
+# Downloading nltk punkt to be used in sentence tokenizer
+python -m nltk.downloader 'punkt'
\ No newline at end of file
diff --git a/bsmetadata/preprocessing_tools/website_desc_utils.py b/bsmetadata/preprocessing_tools/website_desc_utils.py
index 3f316999..3bc30213 100644
--- a/bsmetadata/preprocessing_tools/website_desc_utils.py
+++ b/bsmetadata/preprocessing_tools/website_desc_utils.py
@@ -1,6 +1,7 @@
 from collections import defaultdict
 from typing import Optional
 
+import nltk
 from wikipedia2vec.dump_db import DumpDB
 
 
@@ -21,11 +22,7 @@ def fetch_wikipedia_title_from_keyword(self, keyword: str) -> str:
 
     def fetch_wikipedia_description_for_title(self, title: str) -> Optional:
         try:
             text = self.wiki_dump_db.get_paragraphs(title)[0].text
-            text = ". ".join(
-                text.split(". ")[:2]
-            )  # Picking the first two sentences from the text (Splitting on '. ' might not give the desired sentence for some corner cases but mostly works)
-            if not text.endswith("."):
-                text += "."
+            text = nltk.sent_tokenize(text)[0]  # Picking the first sentence
         except Exception:
             return None
         return text

From 60e6620dd3b834ef2f3d4b35a2ab69474e4674ec Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Mon, 1 Nov 2021 22:49:37 +0530
Subject: [PATCH 08/29] Update download_wiki_dump.sh

---
 bsmetadata/preprocessing_scripts/download_wiki_dump.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bsmetadata/preprocessing_scripts/download_wiki_dump.sh b/bsmetadata/preprocessing_scripts/download_wiki_dump.sh
index 73e5db9d..7abb8d83 100644
--- a/bsmetadata/preprocessing_scripts/download_wiki_dump.sh
+++ b/bsmetadata/preprocessing_scripts/download_wiki_dump.sh
@@ -1,6 +1,6 @@
 
 
-HUB_REPO_NAME= bs-modeling-metadata/wiki_dump
+HUB_REPO_NAME=bs-modeling-metadata/wiki_dump
 git clone https://huggingface.co/datasets/${HUB_REPO_NAME}

From 0299c8886277cd0f2633ff8018270471a35935c3 Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Tue, 2 Nov 2021 00:07:55 +0530
Subject: [PATCH 09/29] add test

---
 tests/mocks/dump_db.py            | 12 ++++++++++++
 tests/test_preprocessing_utils.py | 20 ++++++++++++++++++++
 2 files changed, 32 insertions(+)
 create mode 100644 tests/mocks/dump_db.py
 create mode 100644 tests/test_preprocessing_utils.py

diff --git a/tests/mocks/dump_db.py b/tests/mocks/dump_db.py
new file mode 100644
index 00000000..a57b9ed7
--- /dev/null
+++ b/tests/mocks/dump_db.py
@@ -0,0 +1,12 @@
+from typing import List
+
+
+class DumpDB:
+    def __init__(self, path_to_dump_db="") -> None:
+        return
+
+    def redirects(self) -> List[tuple]:
+        return [{"a": "A"}, {"test.com", "Test"}]
+
+    def get_paragraphs(self, title: str):
+        return [{"text": "Paragraph 1"}, {"text": "Paragraph 2"}]
diff --git a/tests/test_preprocessing_utils.py b/tests/test_preprocessing_utils.py
new file mode 100644
index 00000000..6246dafd
--- /dev/null
+++ b/tests/test_preprocessing_utils.py
@@ -0,0 +1,20 @@
+import unittest
+from unittest import mock
+
+from mocks.dump_db import DumpDB
+
+from bsmetadata.preprocessing_utils import WebsiteDescPreprocessor
+
+
+class WebsiteDescPreprocessorTester(unittest.TestCase):
+    def setUp(self) -> None:
+        self.html_processor = WebsiteDescPreprocessor()
+
+    @mock.patch("bsmetadata.preprocessing_utils.DumpDB")
+    def test_website_preprocessing(self, mock_db):
+        mock_db.return_value = DumpDB
+        print(mock_db)
+
+
+if __name__ == "__main__":
+    unittest.main()
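
One subtlety worth spelling out in this test setup: `mock.patch` has to target the module where the patched name is looked up, not where it is defined. A minimal sketch of the working pattern that later commits in this series converge on:

    from unittest import mock

    from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils

    # WebsiteDescUtils imports DumpDB into website_desc_utils, so that module's
    # attribute is the one to replace; patching wikipedia2vec itself would not work.
    with mock.patch("bsmetadata.preprocessing_tools.website_desc_utils.DumpDB") as mock_db:
        utils = WebsiteDescUtils("some/path")  # DumpDB("some/path") now returns a MagicMock
        mock_db.assert_called_once_with("some/path")
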
From 5a3f0f1492554e1b4ddbb026965593900deaf19a Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Tue, 2 Nov 2021 00:17:34 +0530
Subject: [PATCH 10/29] Update preprocessing_utils.py

---
 bsmetadata/preprocessing_utils.py | 38 ++++++++++++++-----------------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/bsmetadata/preprocessing_utils.py b/bsmetadata/preprocessing_utils.py
index e239b6cd..bb11230b 100644
--- a/bsmetadata/preprocessing_utils.py
+++ b/bsmetadata/preprocessing_utils.py
@@ -18,7 +18,7 @@ from urllib.parse import unquote, urlsplit
 
 from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils
-from bsmetadata.vendor.dateutil.src.dateutil.parser import ParserError, parse
+# from bsmetadata.vendor.dateutil.src.dateutil.parser import ParserError, parse
 
 
 def get_path_from_url(url):
@@ -27,25 +27,25 @@ def get_path_from_url(url):
     return unquote(parts.path)
 
 
-def parse_date(path):
-    try:
-        return parse(path, fuzzy=True, date_only=True)
-    except ParserError:
-        return None
-    except OverflowError:
-        # this happens sometimes, I don't know why, just ignore it
-        return None
+# def parse_date(path):
+#     try:
+#         return parse(path, fuzzy=True, date_only=True)
+#     except ParserError:
+#         return None
+#     except OverflowError:
+#         # this happens sometimes, I don't know why, just ignore it
+#         return None
 
 
-def fetch_keyword_from_url(url: str) -> str:  # e.g http://www.californialandcan.org/Plumas -> californialandcan.org
-    domain = urlsplit(url).netloc
-    return domain.replace("www.", "")
+# def fetch_keyword_from_url(url: str) -> str:  # e.g http://www.californialandcan.org/Plumas -> californialandcan.org
+#     domain = urlsplit(url).netloc
+#     return domain.replace("www.", "")
 
 
-def remove_improbable_date(x):
-    if x is not None and (x.year < 1983 or x.year > 2021):
-        return None
-    return x
+# def remove_improbable_date(x):
+#     if x is not None and (x.year < 1983 or x.year > 2021):
+#         return None
+#     return x
 
 
 class MetadataPreprocessor(ABC):
@@ -85,11 +85,7 @@ def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
         return examples
 
     def _extract_timestamp_from_url(self, url: str) -> Optional[str]:
-        path = get_path_from_url(url)
-        date = parse_date(path)
-        date = remove_improbable_date(date)
-        date = str(date) if date is not None else None
-        return date
+        return None

From 2bf8ca2cf7b0b939aa059b21bcca6d24b37da269 Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Tue, 2 Nov 2021 00:27:40 +0530
Subject: [PATCH 11/29] Update preprocessing_utils.py

---
 bsmetadata/preprocessing_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bsmetadata/preprocessing_utils.py b/bsmetadata/preprocessing_utils.py
index bb11230b..5756022b 100644
--- a/bsmetadata/preprocessing_utils.py
+++ b/bsmetadata/preprocessing_utils.py
@@ -87,7 +87,7 @@ class WebsiteDescPreprocessor(MetadataPreprocessor):
     """Metadata preprocessor for adding website description based on URLs."""
 
-    def __init__(self, path_wiki_db: str) -> None:
+    def __init__(self, path_wiki_db: str = "../preprocessing_data/wiki_dump/wiki_en_dump_db") -> None:
         self.website_utils = WebsiteDescUtils(path_wiki_db)
         super().__init__()
From 1ef9493f900c803f15b427695b1946c1593a82ab Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Tue, 2 Nov 2021 00:42:01 +0530
Subject: [PATCH 12/29] Update test_preprocessing_utils.py

---
 tests/test_preprocessing_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_preprocessing_utils.py b/tests/test_preprocessing_utils.py
index 6246dafd..0b0eaa32 100644
--- a/tests/test_preprocessing_utils.py
+++ b/tests/test_preprocessing_utils.py
@@ -8,7 +8,7 @@
 
 class WebsiteDescPreprocessorTester(unittest.TestCase):
     def setUp(self) -> None:
-        self.html_processor = WebsiteDescPreprocessor()
+        self.website_processor = WebsiteDescPreprocessor("some/path")
 
     @mock.patch("bsmetadata.preprocessing_utils.DumpDB")
     def test_website_preprocessing(self, mock_db):

From 19ff0348a73e93369532650feb48fe77c69dbd7c Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Tue, 2 Nov 2021 00:45:29 +0530
Subject: [PATCH 13/29] Update test_preprocessing_utils.py

---
 tests/test_preprocessing_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_preprocessing_utils.py b/tests/test_preprocessing_utils.py
index 0b0eaa32..6dbcde88 100644
--- a/tests/test_preprocessing_utils.py
+++ b/tests/test_preprocessing_utils.py
@@ -7,8 +7,8 @@
 
 class WebsiteDescPreprocessorTester(unittest.TestCase):
-    def setUp(self) -> None:
-        self.website_processor = WebsiteDescPreprocessor("some/path")
+    # def setUp(self) -> None:
+    #     self.website_processor = WebsiteDescPreprocessor("some/path")
 
     @mock.patch("bsmetadata.preprocessing_utils.DumpDB")
     def test_website_preprocessing(self, mock_db):
Another test line."] + self.assertEqual(ds[:]["metadata"], target_metadata) + + +if __name__ == "__main__": + unittest.main() From 9afa782d392241518f4737d4e5aca2c2fb7c04b4 Mon Sep 17 00:00:00 2001 From: Shanya Sharma - s0s0cr3 Date: Tue, 2 Nov 2021 03:01:54 +0530 Subject: [PATCH 15/29] fixing a bug in mocking --- tests/test_preprocessing_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_preprocessing_utils.py b/tests/test_preprocessing_utils.py index 7dea688c..27fa90ef 100644 --- a/tests/test_preprocessing_utils.py +++ b/tests/test_preprocessing_utils.py @@ -12,7 +12,7 @@ def mock_sent_tokenize(text): class WebsiteDescPreprocessorTester(unittest.TestCase): - @mock.patch("mocks.website_desc_utils.DumpDB") + @mock.patch("bsmetadata.website_desc_utils.DumpDB") def setUp(self, mock_db) -> None: mock_db.return_value = MockDumpDB("some/path") self.website_processor = WebsiteDescPreprocessor("some/path") From 46773a987efa51e4d491a59d3a8d38d916a7e49f Mon Sep 17 00:00:00 2001 From: Shanya Sharma - s0s0cr3 Date: Tue, 2 Nov 2021 03:13:20 +0530 Subject: [PATCH 16/29] Update test.yml --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3a7dc9de..28d3902e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -29,3 +29,4 @@ jobs: run: | python -m pytest tests/test_get_dataloaders.py python -m pytest tests/test_metadata_utils.py + python -m pytest tests/preprocessing_utils.py From 8cd384d57d06c50fde051ffc43f6e94656df0456 Mon Sep 17 00:00:00 2001 From: Shanya Sharma - s0s0cr3 Date: Tue, 2 Nov 2021 03:23:11 +0530 Subject: [PATCH 17/29] updating name in workflow --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 28d3902e..12419168 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -29,4 +29,4 @@ jobs: run: | python -m pytest tests/test_get_dataloaders.py python -m pytest tests/test_metadata_utils.py - python -m pytest tests/preprocessing_utils.py + python -m pytest tests/test_preprocessing_utils.py From e6e03425f7dd73f5f74d25ec08ef132931d4f96a Mon Sep 17 00:00:00 2001 From: Shanya Sharma - s0s0cr3 Date: Tue, 2 Nov 2021 03:34:39 +0530 Subject: [PATCH 18/29] adding nltk to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index a531d72a..38f5ad91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ transformers>=4.6.0,<5 # pip will likely update it to 4.10.0, but it is probabl accelerate>=0.4.0,<1 # We may want to use 0.5.0 in the near future datasets[streaming]>=1.11.0,<2 wikipedia2vec==1.0.5 +nltk==3.6.5 From a82cd7896a5c6e142ad35aa397bc8e94dd2eae49 Mon Sep 17 00:00:00 2001 From: Shanya Sharma - s0s0cr3 Date: Tue, 2 Nov 2021 04:02:31 +0530 Subject: [PATCH 19/29] Update test_preprocessing_utils.py --- tests/test_preprocessing_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_preprocessing_utils.py b/tests/test_preprocessing_utils.py index 27fa90ef..079e526b 100644 --- a/tests/test_preprocessing_utils.py +++ b/tests/test_preprocessing_utils.py @@ -12,7 +12,7 @@ def mock_sent_tokenize(text): class WebsiteDescPreprocessorTester(unittest.TestCase): - @mock.patch("bsmetadata.website_desc_utils.DumpDB") + @mock.patch("bsmetadata.preprocessing_tools.website_desc_utils.DumpDB") def setUp(self, mock_db) -> None: 
From a82cd7896a5c6e142ad35aa397bc8e94dd2eae49 Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Tue, 2 Nov 2021 04:02:31 +0530
Subject: [PATCH 19/29] Update test_preprocessing_utils.py

---
 tests/test_preprocessing_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_preprocessing_utils.py b/tests/test_preprocessing_utils.py
index 27fa90ef..079e526b 100644
--- a/tests/test_preprocessing_utils.py
+++ b/tests/test_preprocessing_utils.py
@@ -12,7 +12,7 @@ def mock_sent_tokenize(text):
 
 class WebsiteDescPreprocessorTester(unittest.TestCase):
-    @mock.patch("bsmetadata.website_desc_utils.DumpDB")
+    @mock.patch("bsmetadata.preprocessing_tools.website_desc_utils.DumpDB")
     def setUp(self, mock_db) -> None:
         mock_db.return_value = MockDumpDB("some/path")
         self.website_processor = WebsiteDescPreprocessor("some/path")

From 8160bda1b96034c9dcf73f68c74c5236a636cd98 Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Tue, 2 Nov 2021 04:10:03 +0530
Subject: [PATCH 20/29] Update test_preprocessing_utils.py

---
 tests/test_preprocessing_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_preprocessing_utils.py b/tests/test_preprocessing_utils.py
index 079e526b..a237aad9 100644
--- a/tests/test_preprocessing_utils.py
+++ b/tests/test_preprocessing_utils.py
@@ -27,7 +27,7 @@ def setUp(self, mock_db) -> None:
 
         self.example_dict = {"id": self.example_ids, "metadata": self.example_metadata, "text": self.example_text}
 
-    @mock.patch("mocks.website_desc_utils.nltk.sent_tokenize", new=mock_sent_tokenize)
+    @mock.patch("bsmetadata.preprocessing_tools.website_desc_utils.nltk.sent_tokenize", new=mock_sent_tokenize)
     def test_website_metadata_processor(self):
         ds = Dataset.from_dict(self.example_dict)
         ds = ds.map(lambda ex: self.website_processor.preprocess(ex), batched=True)
Another test line."] + target_metadata = [ + [ + {"key": "url", "type": "global", "value": "https://www.xyz.com"}, + {"key": "website_description", "type": "global", "value": "XYZ is a U.S. based company."}, + ], + [ + {"key": "url", "type": "global", "value": "http://sometitle.com"}, + {"key": "url", "type": "global", "value": "http://notfound.com"}, + {"key": "website_description", "type": "global", "value": "SomeTitle is a U.S. based company."}, + ], + [ + {"key": "url", "type": "global", "value": "https://www.test.com"}, + {"key": "website_description", "type": "global", "value": "Test is a U.S. based company."}, + ], + ] self.assertEqual(ds[:]["metadata"], target_metadata) + # target_metadata = ["XYZ is a U.S. based company. Another test line."] + # self.assertEqual(ds[:]["metadata"], target_metadata) + if __name__ == "__main__": unittest.main() From 7ea07b4c1456c0747e94f76d62fd0b292af4fadf Mon Sep 17 00:00:00 2001 From: Shanya Sharma - s0s0cr3 Date: Tue, 2 Nov 2021 04:22:14 +0530 Subject: [PATCH 22/29] reverting changes from test --- bsmetadata/preprocessing_utils.py | 32 +++++++++++++++++-------------- tests/test_preprocessing_utils.py | 2 -- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/bsmetadata/preprocessing_utils.py b/bsmetadata/preprocessing_utils.py index 8d170f43..688784c2 100644 --- a/bsmetadata/preprocessing_utils.py +++ b/bsmetadata/preprocessing_utils.py @@ -20,7 +20,7 @@ from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils -# from bsmetadata.vendor.dateutil.src.dateutil.parser import ParserError, parse +from bsmetadata.vendor.dateutil.src.dateutil.parser import ParserError, parse def get_path_from_url(url): @@ -29,14 +29,14 @@ def get_path_from_url(url): return unquote(parts.path) -# def parse_date(path): -# try: -# return parse(path, fuzzy=True, date_only=True) -# except ParserError: -# return None -# except OverflowError: -# # this happens sometimes, I don't know why, just ignore it -# return None +def parse_date(path): + try: + return parse(path, fuzzy=True, date_only=True) + except ParserError: + return None + except OverflowError: + # this happens sometimes, I don't know why, just ignore it + return None def fetch_keyword_from_url(url: str) -> str: # e.g http://www.californialandcan.org/Plumas -> californialandcan.org @@ -44,10 +44,10 @@ def fetch_keyword_from_url(url: str) -> str: # e.g http://www.californialandcan return domain.replace("www.", "") -# def remove_improbable_date(x): -# if x is not None and (x.year < 1983 or x.year > 2021): -# return None -# return x +def remove_improbable_date(x): + if x is not None and (x.year < 1983 or x.year > 2021): + return None + return x class MetadataPreprocessor(ABC): @@ -83,7 +83,11 @@ def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]: return examples def _extract_timestamp_from_url(self, url: str) -> Optional[str]: - return None + path = get_path_from_url(url) + date = parse_date(path) + date = remove_improbable_date(date) + date = str(date) if date is not None else None + return date class WebsiteDescPreprocessor(MetadataPreprocessor): diff --git a/tests/test_preprocessing_utils.py b/tests/test_preprocessing_utils.py index fda26b7a..42292150 100644 --- a/tests/test_preprocessing_utils.py +++ b/tests/test_preprocessing_utils.py @@ -50,8 +50,6 @@ def test_website_metadata_processor(self): ] self.assertEqual(ds[:]["metadata"], target_metadata) - # target_metadata = ["XYZ is a U.S. based company. 
Another test line."] - # self.assertEqual(ds[:]["metadata"], target_metadata) if __name__ == "__main__": From 7ef1d9a59b904bd93f5137828c522ec2825e2fb7 Mon Sep 17 00:00:00 2001 From: Shanya Sharma - s0s0cr3 Date: Tue, 2 Nov 2021 04:22:58 +0530 Subject: [PATCH 23/29] fixing quality --- bsmetadata/preprocessing_utils.py | 2 -- tests/test_preprocessing_utils.py | 1 - 2 files changed, 3 deletions(-) diff --git a/bsmetadata/preprocessing_utils.py b/bsmetadata/preprocessing_utils.py index 688784c2..54bdadde 100644 --- a/bsmetadata/preprocessing_utils.py +++ b/bsmetadata/preprocessing_utils.py @@ -18,8 +18,6 @@ from urllib.parse import unquote, urlsplit from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils - - from bsmetadata.vendor.dateutil.src.dateutil.parser import ParserError, parse diff --git a/tests/test_preprocessing_utils.py b/tests/test_preprocessing_utils.py index 42292150..794c8ce7 100644 --- a/tests/test_preprocessing_utils.py +++ b/tests/test_preprocessing_utils.py @@ -51,6 +51,5 @@ def test_website_metadata_processor(self): self.assertEqual(ds[:]["metadata"], target_metadata) - if __name__ == "__main__": unittest.main() From b2511097d9f05afe09222857d737d3845f576600 Mon Sep 17 00:00:00 2001 From: Shanya Sharma - s0s0cr3 Date: Tue, 2 Nov 2021 12:04:13 +0530 Subject: [PATCH 24/29] modifying script and deleting extra file --- .github/workflows/test.yml | 1 - .../preprocessing_scripts/download_wiki_dump.sh | 8 ++++++-- tests/mocks/dump_db.py | 12 ------------ tests/mocks/mock_dump_db.py | 15 ++++++++------- tests/test_preprocessing_utils.py | 2 +- 5 files changed, 15 insertions(+), 23 deletions(-) delete mode 100644 tests/mocks/dump_db.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 12419168..3a7dc9de 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -29,4 +29,3 @@ jobs: run: | python -m pytest tests/test_get_dataloaders.py python -m pytest tests/test_metadata_utils.py - python -m pytest tests/test_preprocessing_utils.py diff --git a/bsmetadata/preprocessing_scripts/download_wiki_dump.sh b/bsmetadata/preprocessing_scripts/download_wiki_dump.sh index 7abb8d83..80583402 100644 --- a/bsmetadata/preprocessing_scripts/download_wiki_dump.sh +++ b/bsmetadata/preprocessing_scripts/download_wiki_dump.sh @@ -1,8 +1,12 @@ +${1:-../preprocessing_data} # default director: preprocessing_data + +## Clone the huggingface dataset repo containing wiki dump +mkdir $1 HUB_REPO_NAME=bs-modeling-metadata/wiki_dump -git clone https://huggingface.co/datasets/${HUB_REPO_NAME} +git clone https://huggingface.co/datasets/${HUB_REPO_NAME} $1 -# Downloading nltk punkt to be used in sentence tokenizer +## Downloading nltk punkt to be used in sentence tokenizer python -m nltk.downloader 'punkt' \ No newline at end of file diff --git a/tests/mocks/dump_db.py b/tests/mocks/dump_db.py deleted file mode 100644 index a57b9ed7..00000000 --- a/tests/mocks/dump_db.py +++ /dev/null @@ -1,12 +0,0 @@ -from typing import List - - -class DumpDB: - def __init__(self, path_to_dump_db="") -> None: - return - - def redirects(self) -> List[tuple]: - return [{"a": "A"}, {"test.com", "Test"}] - - def get_paragraphs(self, title: str): - return [{"text": "Paragraph 1"}, {"text": "Paragraph 2"}] diff --git a/tests/mocks/mock_dump_db.py b/tests/mocks/mock_dump_db.py index d34622b4..a500c65b 100644 --- a/tests/mocks/mock_dump_db.py +++ b/tests/mocks/mock_dump_db.py @@ -9,12 +9,8 @@ def __init__(self, text): class MockDumpDB: def __init__(self, db_file) -> None: 
From b2511097d9f05afe09222857d737d3845f576600 Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Tue, 2 Nov 2021 12:04:13 +0530
Subject: [PATCH 24/29] modifying script and deleting extra file

---
 .github/workflows/test.yml                            |  1 -
 .../preprocessing_scripts/download_wiki_dump.sh       |  8 ++++++--
 tests/mocks/dump_db.py                                | 12 ------------
 tests/mocks/mock_dump_db.py                           | 15 ++++++++-------
 tests/test_preprocessing_utils.py                     |  2 +-
 5 files changed, 15 insertions(+), 23 deletions(-)
 delete mode 100644 tests/mocks/dump_db.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 12419168..3a7dc9de 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -29,4 +29,3 @@ jobs:
       run: |
         python -m pytest tests/test_get_dataloaders.py
         python -m pytest tests/test_metadata_utils.py
-        python -m pytest tests/test_preprocessing_utils.py
diff --git a/bsmetadata/preprocessing_scripts/download_wiki_dump.sh b/bsmetadata/preprocessing_scripts/download_wiki_dump.sh
index 7abb8d83..80583402 100644
--- a/bsmetadata/preprocessing_scripts/download_wiki_dump.sh
+++ b/bsmetadata/preprocessing_scripts/download_wiki_dump.sh
@@ -1,8 +1,12 @@
 
+${1:-../preprocessing_data} # default director: preprocessing_data
 
+## Clone the huggingface dataset repo containing wiki dump
+mkdir $1
 HUB_REPO_NAME=bs-modeling-metadata/wiki_dump
-git clone https://huggingface.co/datasets/${HUB_REPO_NAME}
+git clone https://huggingface.co/datasets/${HUB_REPO_NAME} $1
 
 
-# Downloading nltk punkt to be used in sentence tokenizer
+## Downloading nltk punkt to be used in sentence tokenizer
 python -m nltk.downloader 'punkt'
\ No newline at end of file
diff --git a/tests/mocks/dump_db.py b/tests/mocks/dump_db.py
deleted file mode 100644
index a57b9ed7..00000000
--- a/tests/mocks/dump_db.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from typing import List
-
-
-class DumpDB:
-    def __init__(self, path_to_dump_db="") -> None:
-        return
-
-    def redirects(self) -> List[tuple]:
-        return [{"a": "A"}, {"test.com", "Test"}]
-
-    def get_paragraphs(self, title: str):
-        return [{"text": "Paragraph 1"}, {"text": "Paragraph 2"}]
diff --git a/tests/mocks/mock_dump_db.py b/tests/mocks/mock_dump_db.py
index d34622b4..a500c65b 100644
--- a/tests/mocks/mock_dump_db.py
+++ b/tests/mocks/mock_dump_db.py
@@ -9,12 +9,8 @@ def __init__(self, text):
 class MockDumpDB:
     def __init__(self, db_file) -> None:
         self.db_file = db_file
-
-    def redirects(self) -> List[tuple]:
-        return [("xyz.com", "XYZ"), ("test.com", "Test"), ("test_key", "Test Key")]
-
-    def get_paragraphs(self, title: str):
-        paragraphs_map = {
+        self.redirect_info = [("xyz.com", "XYZ"), ("test.com", "Test"), ("test_key", "Test Key")]
+        self.paragraphs_map = {
             "XYZ": [
                 MockParagraph("XYZ is a U.S. based company."),
                 MockParagraph("Test paragraph for the key XYZ."),
@@ -28,4 +24,9 @@ def get_paragraphs(self, title: str):
                 MockParagraph("Test paragraph for the key SomeTitle."),
             ],
         }
-        return paragraphs_map[title]
+
+    def redirects(self) -> List[tuple]:
+        return self.redirect_info
+
+    def get_paragraphs(self, title: str):
+        return self.paragraphs_map[title]
diff --git a/tests/test_preprocessing_utils.py b/tests/test_preprocessing_utils.py
index 794c8ce7..2061f4e2 100644
--- a/tests/test_preprocessing_utils.py
+++ b/tests/test_preprocessing_utils.py
@@ -15,7 +15,7 @@ class WebsiteDescPreprocessorTester(unittest.TestCase):
     @mock.patch("bsmetadata.preprocessing_tools.website_desc_utils.DumpDB")
     def setUp(self, mock_db) -> None:
         mock_db.return_value = MockDumpDB("some/path")
-        self.website_processor = WebsiteDescPreprocessor("some/path")
+        self.website_processor = WebsiteDescPreprocessor()
         self.example_ids = [0, 1, 2]
         self.example_text = ["test text 1", "test text 2", "test text 3"]
         self.example_metadata = [

From ca9c9d06b2511097d9f05afe0922285712041301 Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Tue, 2 Nov 2021 12:10:01 +0530
Subject: [PATCH 25/29] Update preprocessing_utils.py

---
 bsmetadata/preprocessing_utils.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/bsmetadata/preprocessing_utils.py b/bsmetadata/preprocessing_utils.py
index 54bdadde..8728e0b7 100644
--- a/bsmetadata/preprocessing_utils.py
+++ b/bsmetadata/preprocessing_utils.py
@@ -18,7 +18,7 @@
 from urllib.parse import unquote, urlsplit
 
 from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils
-from bsmetadata.vendor.dateutil.src.dateutil.parser import ParserError, parse
+# from bsmetadata.vendor.dateutil.src.dateutil.parser import ParserError, parse
 
 
 def get_path_from_url(url):
@@ -27,14 +27,14 @@ def get_path_from_url(url):
     return unquote(parts.path)
 
 
-def parse_date(path):
-    try:
-        return parse(path, fuzzy=True, date_only=True)
-    except ParserError:
-        return None
-    except OverflowError:
-        # this happens sometimes, I don't know why, just ignore it
-        return None
+# def parse_date(path):
+#     try:
+#         return parse(path, fuzzy=True, date_only=True)
+#     except ParserError:
+#         return None
+#     except OverflowError:
+#         # this happens sometimes, I don't know why, just ignore it
+#         return None
 
 
 def fetch_keyword_from_url(url: str) -> str:  # e.g http://www.californialandcan.org/Plumas -> californialandcan.org
@@ -82,8 +82,8 @@ def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
 
     def _extract_timestamp_from_url(self, url: str) -> Optional[str]:
         path = get_path_from_url(url)
-        date = parse_date(path)
-        date = remove_improbable_date(date)
+        # date = parse_date(path)
+        date = remove_improbable_date(path)
         date = str(date) if date is not None else None
         return date
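
For context, the timestamp path that keeps being toggled in these commits is a three-step pipeline; a sketch of the intended flow when the vendored dateutil parser is importable:

    from urllib.parse import unquote, urlsplit

    url = "http://example.com/blog/2015/03/28/some-post"
    path = unquote(urlsplit(url).path)  # "/blog/2015/03/28/some-post"
    # parse(path, fuzzy=True, date_only=True) would fish datetime(2015, 3, 28)
    # out of the surrounding noise; remove_improbable_date() then discards
    # years outside 1983-2021, and the result is stringified or left as None.
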
From fe9a22825a8c0dbba4be87569fe96d79da1f1870 Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Tue, 2 Nov 2021 12:47:35 +0530
Subject: [PATCH 26/29] Update download_wiki_dump.sh

---
 bsmetadata/preprocessing_scripts/download_wiki_dump.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bsmetadata/preprocessing_scripts/download_wiki_dump.sh b/bsmetadata/preprocessing_scripts/download_wiki_dump.sh
index 80583402..db01be0f 100644
--- a/bsmetadata/preprocessing_scripts/download_wiki_dump.sh
+++ b/bsmetadata/preprocessing_scripts/download_wiki_dump.sh
@@ -1,11 +1,11 @@
 
-${1:-../preprocessing_data} # default director: preprocessing_data
+out_dir=${1:-bsmetadata/preprocessing_data} # default director: preprocessing_data
 
 ## Clone the huggingface dataset repo containing wiki dump
-mkdir $1
+mkdir -p "$out_dir"
 HUB_REPO_NAME=bs-modeling-metadata/wiki_dump
-git clone https://huggingface.co/datasets/${HUB_REPO_NAME} $1
+git clone https://huggingface.co/datasets/${HUB_REPO_NAME} $out_dir/wiki_dump
 
 
 ## Downloading nltk punkt to be used in sentence tokenizer
 python -m nltk.downloader 'punkt'

From e185fbec0c39672c971fcef55059196afbacc343 Mon Sep 17 00:00:00 2001
From: Shanya Sharma - s0s0cr3
Date: Tue, 2 Nov 2021 12:51:29 +0530
Subject: [PATCH 27/29] Update preprocessing_utils.py

---
 bsmetadata/preprocessing_utils.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/bsmetadata/preprocessing_utils.py b/bsmetadata/preprocessing_utils.py
index 8728e0b7..54bdadde 100644
--- a/bsmetadata/preprocessing_utils.py
+++ b/bsmetadata/preprocessing_utils.py
@@ -18,7 +18,7 @@
 from urllib.parse import unquote, urlsplit
 
 from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils
-# from bsmetadata.vendor.dateutil.src.dateutil.parser import ParserError, parse
+from bsmetadata.vendor.dateutil.src.dateutil.parser import ParserError, parse
 
 
 def get_path_from_url(url):
@@ -27,14 +27,14 @@ def get_path_from_url(url):
     return unquote(parts.path)
 
 
-# def parse_date(path):
-#     try:
-#         return parse(path, fuzzy=True, date_only=True)
-#     except ParserError:
-#         return None
-#     except OverflowError:
-#         # this happens sometimes, I don't know why, just ignore it
-#         return None
+def parse_date(path):
+    try:
+        return parse(path, fuzzy=True, date_only=True)
+    except ParserError:
+        return None
+    except OverflowError:
+        # this happens sometimes, I don't know why, just ignore it
+        return None
 
 
 def fetch_keyword_from_url(url: str) -> str:  # e.g http://www.californialandcan.org/Plumas -> californialandcan.org
@@ -82,8 +82,8 @@ def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
 
     def _extract_timestamp_from_url(self, url: str) -> Optional[str]:
         path = get_path_from_url(url)
-        # date = parse_date(path)
-        date = remove_improbable_date(path)
+        date = parse_date(path)
+        date = remove_improbable_date(date)
         date = str(date) if date is not None else None
         return date
re.sub(r"\((?:[^)(]|\([^)(]*\))*\)", "", text) text = nltk.sent_tokenize(text)[0] # Picking the first sentence except Exception: return None diff --git a/bsmetadata/preprocessing_utils.py b/bsmetadata/preprocessing_utils.py index b5478c8c..acd5f8c2 100644 --- a/bsmetadata/preprocessing_utils.py +++ b/bsmetadata/preprocessing_utils.py @@ -124,6 +124,7 @@ def _extract_website_desc_from_url(self, url: str) -> Optional: keyword = fetch_keyword_from_url(url) return self.website_utils.fetch_website_description_from_keyword(keyword) + class EntityPreprocessor(MetadataPreprocessor): """Metadata preprocessor for adding entity information.""" diff --git a/setup.py b/setup.py index 40f60afe..2bec9d37 100644 --- a/setup.py +++ b/setup.py @@ -21,5 +21,6 @@ def req_file(filename): install_requires=install_requires, extras_require={ "entity_preprocessing": ["REL @ git+https://github.com/manandey/REL.git#egg=REL"], + "website_description_preprocessing": ["wikipedia2vec==1.0.5", "nltk==3.6.5"], }, ) From 1611733401a5b3cd780c1dc5ded57a960a689a5a Mon Sep 17 00:00:00 2001 From: Shanya Sharma - s0s0cr3 Date: Tue, 23 Nov 2021 15:15:37 +0530 Subject: [PATCH 29/29] make quality --- bsmetadata/preprocessing_tools/website_desc_utils.py | 2 +- bsmetadata/preprocessing_utils.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bsmetadata/preprocessing_tools/website_desc_utils.py b/bsmetadata/preprocessing_tools/website_desc_utils.py index ad6641d2..942bab71 100644 --- a/bsmetadata/preprocessing_tools/website_desc_utils.py +++ b/bsmetadata/preprocessing_tools/website_desc_utils.py @@ -1,8 +1,8 @@ +import re from collections import defaultdict from typing import Optional import nltk -import re from wikipedia2vec.dump_db import DumpDB diff --git a/bsmetadata/preprocessing_utils.py b/bsmetadata/preprocessing_utils.py index acd5f8c2..85fcfd90 100644 --- a/bsmetadata/preprocessing_utils.py +++ b/bsmetadata/preprocessing_utils.py @@ -18,12 +18,12 @@ from typing import Dict, List, Optional from urllib.parse import unquote, urlsplit -from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils from REL.entity_disambiguation import EntityDisambiguation from REL.mention_detection import MentionDetection from REL.ner import load_flair_ner from REL.utils import process_results +from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils from bsmetadata.vendor.dateutil.src.dateutil.parser import ParserError, parse @@ -124,7 +124,8 @@ def _extract_website_desc_from_url(self, url: str) -> Optional: keyword = fetch_keyword_from_url(url) return self.website_utils.fetch_website_description_from_keyword(keyword) - + + class EntityPreprocessor(MetadataPreprocessor): """Metadata preprocessor for adding entity information."""