-
Notifications
You must be signed in to change notification settings - Fork 11
Adding WebsiteMetadataProcessor to preprocessing_utils #49
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5d6a2a2
80dcd7a
742d2db
6cd8389
7aa1797
561867a
7a59eba
fce9980
ab7bd72
60e6620
0299c88
5a3f0f1
2bf8ca2
1ef9493
19ff034
a6761db
9afa782
46773a9
8cd384d
e6e0342
a3785e3
a82cd78
8160bda
f8c05e2
7ea07b4
7ef1d9a
b251109
ca9c9d0
fe9a228
e185fbe
21c15c1
81af40a
1611733
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
|
|
||
|
|
||
| out_dir=${1:-bsmetadata/preprocessing_data} # default director: preprocessing_data | ||
|
|
||
| ## Clone the huggingface dataset repo containing wiki dump | ||
| mkdir -p "$out_dir" | ||
| HUB_REPO_NAME=bs-modeling-metadata/wiki_dump | ||
| git clone https://huggingface.co/datasets/${HUB_REPO_NAME} $out_dir/wiki_dump | ||
|
|
||
|
|
||
| ## Downloading nltk punkt to be used in sentence tokenizer | ||
| python -m nltk.downloader 'punkt' |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
| import re | ||
| from collections import defaultdict | ||
| from typing import Optional | ||
|
|
||
| import nltk | ||
| from wikipedia2vec.dump_db import DumpDB | ||
|
|
||
|
|
||
class WebsiteDescUtils:
    """Fetch a one-sentence Wikipedia description for a website domain keyword.

    Resolves a domain keyword (e.g. ``rightmove.com``) to a Wikipedia article
    title via the dump's redirect map, then extracts the first sentence of the
    article, with parenthesized segments (e.g. pronunciations) removed.
    """

    def __init__(self, path_wiki_db: str) -> None:
        # keyword -> description; None is cached too, so unresolvable keywords
        # are looked up at most once.
        self.cache: dict = {}
        self.wiki_dump_db = DumpDB(path_wiki_db)
        # Loading all redirect information: takes ~10s. Keys are lowercased so
        # lookups are case-insensitive.
        self.redirects_map = {key.lower(): value for key, value in self.wiki_dump_db.redirects()}

    def fetch_wikipedia_title_from_keyword(self, keyword: str) -> str:
        """Resolve a domain keyword to a Wikipedia article title.

        Lowercases the keyword to match the lowercased redirect-map keys.
        Falls back to the capitalized first label of the domain when the
        keyword is not recognized (e.g. ``rightmove.com`` -> ``Rightmove``),
        since Wikipedia titles are capitalized.
        """
        return self.redirects_map.get(keyword.lower(), keyword.split(".")[0].capitalize())

    def fetch_wikipedia_description_for_title(self, title: str) -> Optional[str]:
        """Return the first sentence of the article ``title``, or None.

        Parenthesized text (possibly one level nested, e.g. pronunciation
        guides) is stripped before sentence-splitting. Any failure — missing
        title, empty article, tokenizer error — yields None (best-effort).
        """
        try:
            text = self.wiki_dump_db.get_paragraphs(title)[0].text
            # Remove (possibly nested) parenthesized segments such as pronunciations.
            text = re.sub(r"\((?:[^)(]|\([^)(]*\))*\)", "", text)
            text = nltk.sent_tokenize(text)[0]  # keep only the first sentence
        except Exception:
            return None
        return text

    def extract_wiki_desc(self, keyword: str) -> Optional[str]:
        """Resolve ``keyword`` to a title and fetch its description (uncached)."""
        title = self.fetch_wikipedia_title_from_keyword(keyword)
        return self.fetch_wikipedia_description_for_title(title)

    def fetch_website_description_from_keyword(self, keyword: str) -> Optional[str]:
        """Cached wrapper around :meth:`extract_wiki_desc`.

        Caches failures (None) as well, so a keyword with no description does
        not trigger a DB lookup on every call (the original defaultdict-based
        falsy check re-fetched whenever the cached value was None or "").
        """
        if keyword not in self.cache:
            self.cache[keyword] = self.extract_wiki_desc(keyword)
        return self.cache[keyword]
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,3 +4,5 @@ wandb>=0.10.32,<1 # pip will likely update it to 0.12.1, but it is probably ok | |
| transformers>=4.6.0,<5 # pip will likely update it to 4.10.0, but it is probably ok and good for bugfixes. | ||
| accelerate>=0.4.0,<1 # We may want to use 0.5.0 in the near future | ||
| datasets[streaming]>=1.11.0,<2 | ||
| wikipedia2vec==1.0.5 | ||
| nltk==3.6.5 | ||
|
Comment on lines
+7
to
+8
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Really a tiny detail, but I think we can make these dependencies optional by adding them to the
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thank you for pointing this out @SaulLu :) I've made the required changes |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
from typing import List


class MockParagraph:
    """Minimal stand-in for a wikipedia2vec paragraph: exposes only ``.text``."""

    def __init__(self, text):
        self.text = text


class MockDumpDB:
    """In-memory test double for wikipedia2vec's ``DumpDB``.

    Provides a fixed redirect list and a small title -> paragraphs mapping;
    the file path is stored but never opened.
    """

    def __init__(self, db_file) -> None:
        self.db_file = db_file
        # (keyword, canonical title) pairs, mirroring DumpDB.redirects().
        self.redirect_info = [("xyz.com", "XYZ"), ("test.com", "Test"), ("test_key", "Test Key")]
        # Each known title maps to two paragraphs; callers read the first one.
        self.paragraphs_map = {
            title: [
                MockParagraph(f"{display} is a U.S. based company."),
                MockParagraph(f"Test paragraph for the key {display}."),
            ]
            for title, display in [("XYZ", "XYZ"), ("Test", "Test"), ("Sometitle", "SomeTitle")]
        }

    def redirects(self) -> List[tuple]:
        """Return the canned redirect pairs."""
        return self.redirect_info

    def get_paragraphs(self, title: str):
        """Return the paragraphs for ``title``; raises KeyError for unknown titles."""
        return self.paragraphs_map[title]
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,55 @@ | ||
| import unittest | ||
| from unittest import mock | ||
|
|
||
| from datasets import Dataset | ||
| from mocks.mock_dump_db import MockDumpDB | ||
|
|
||
| from bsmetadata.preprocessing_utils import WebsiteDescPreprocessor | ||
|
|
||
|
|
||
def mock_sent_tokenize(text):
    """Test double for nltk.sent_tokenize: treats the whole input as one sentence."""
    sentences = [text]
    return sentences
|
||
class WebsiteDescPreprocessorTester(unittest.TestCase):
    """Unit tests for WebsiteDescPreprocessor, backed by a mocked wiki DumpDB."""

    @mock.patch("bsmetadata.preprocessing_tools.website_desc_utils.DumpDB")
    def setUp(self, mock_db) -> None:
        # Swap the real DumpDB for the in-memory mock before the processor loads it.
        mock_db.return_value = MockDumpDB("some/path")
        self.website_processor = WebsiteDescPreprocessor()
        self.example_ids = [0, 1, 2]
        self.example_text = ["test text 1", "test text 2", "test text 3"]
        self.example_metadata = [
            [{"key": "url", "type": "global", "value": "https://www.xyz.com"}],
            [
                {"key": "url", "type": "global", "value": "http://sometitle.com"},
                {"key": "url", "type": "global", "value": "http://notfound.com"},
            ],
            [{"key": "url", "type": "global", "value": "https://www.test.com"}],
        ]
        self.example_dict = {
            "id": self.example_ids,
            "metadata": self.example_metadata,
            "text": self.example_text,
        }

    @mock.patch("bsmetadata.preprocessing_tools.website_desc_utils.nltk.sent_tokenize", new=mock_sent_tokenize)
    def test_website_metadata_processor(self):
        # Run the processor over a batched dataset and check that a
        # website_description entry is appended per example (skipping
        # unresolvable URLs such as notfound.com).
        dataset = Dataset.from_dict(self.example_dict)
        dataset = dataset.map(lambda batch: self.website_processor.preprocess(batch), batched=True)
        expected_metadata = [
            [
                {"key": "url", "type": "global", "value": "https://www.xyz.com"},
                {"key": "website_description", "type": "global", "value": "XYZ is a U.S. based company."},
            ],
            [
                {"key": "url", "type": "global", "value": "http://sometitle.com"},
                {"key": "url", "type": "global", "value": "http://notfound.com"},
                {"key": "website_description", "type": "global", "value": "SomeTitle is a U.S. based company."},
            ],
            [
                {"key": "url", "type": "global", "value": "https://www.test.com"},
                {"key": "website_description", "type": "global", "value": "Test is a U.S. based company."},
            ],
        ]
        self.assertEqual(dataset[:]["metadata"], expected_metadata)


if __name__ == "__main__":
    unittest.main()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it maybe make sense to remove information in brackets? For example, the first sentence in the Wikipedia article on Wikipedia itself is
> Wikipedia (/ˌwɪkɪˈpiːdiə/ (listen) wik-ih-PEE-dee-ə or /ˌwɪki-/ (listen) wik-ee-) is a free content, multilingual online encyclopedia written and maintained by a community of volunteers through a model of open collaboration, using a wiki-based editing system.

At least in this case, I would think that the part in brackets (/ˌwɪkɪˈpiːdiə/ (listen) wik-ih-PEE-dee-ə or /ˌwɪki-/ (listen) wik-ee-) is completely useless for the model but will probably require many tokens. Any thoughts on that?

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds like a great idea!
If it saves you some time @shanyas10 , here is a little code snippet that will remove the text in parentheses:
On @timoschick's example, it will output:
> Wikipedia is a free content, multilingual online encyclopedia written and maintained by a community of volunteers through a model of open collaboration, using a wiki-based editing system.

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes a lot of sense @timoschick .
Thank you for the snippet @SaulLu . I've made the changes accordingly