Merged

33 commits
5d6a2a2
Update preprocessing_utils.py
Oct 15, 2021
80dcd7a
Update preprocessing_utils.py
Oct 15, 2021
742d2db
Merge branch 'master' into ds_preprocessing_website
Oct 29, 2021
6cd8389
Merge branch 'master' into ds_preprocessing_website
Oct 31, 2021
7aa1797
adding processor for website metadata
Nov 1, 2021
561867a
Create download_wiki_dump.sh
Nov 1, 2021
7a59eba
run style and quality checks
Nov 1, 2021
fce9980
Update website_desc_utils.py
Nov 1, 2021
ab7bd72
adding tokenization for sentence
Nov 1, 2021
60e6620
Update download_wiki_dump.sh
Nov 1, 2021
0299c88
add test
Nov 1, 2021
5a3f0f1
Update preprocessing_utils.py
Nov 1, 2021
2bf8ca2
Update preprocessing_utils.py
Nov 1, 2021
1ef9493
Update test_preprocessing_utils.py
Nov 1, 2021
19ff034
Update test_preprocessing_utils.py
Nov 1, 2021
a6761db
adding tests
Nov 1, 2021
9afa782
fixing a bug in mocking
Nov 1, 2021
46773a9
Update test.yml
Nov 1, 2021
8cd384d
updating name in workflow
Nov 1, 2021
e6e0342
adding nltk to requirements
Nov 1, 2021
a3785e3
Merge branch 'ds_preprocessing_website' into test_ds_preprocess
Nov 1, 2021
a82cd78
Update test_preprocessing_utils.py
Nov 1, 2021
8160bda
Update test_preprocessing_utils.py
Nov 1, 2021
f8c05e2
fixing tests
Nov 1, 2021
7ea07b4
reverting changes from test
Nov 1, 2021
7ef1d9a
fixing quality
Nov 1, 2021
b251109
modifying script and deleting extra file
Nov 2, 2021
ca9c9d0
Update preprocessing_utils.py
Nov 2, 2021
fe9a228
Update download_wiki_dump.sh
Nov 2, 2021
e185fbe
Update preprocessing_utils.py
Nov 2, 2021
21c15c1
Merge branch 'master' into ds_preprocessing_website
Nov 23, 2021
81af40a
addressing PR comments
Nov 23, 2021
1611733
make quality
Nov 23, 2021
12 changes: 12 additions & 0 deletions bsmetadata/preprocessing_scripts/download_wiki_dump.sh
@@ -0,0 +1,12 @@
#!/bin/bash

out_dir=${1:-bsmetadata/preprocessing_data} # default directory: bsmetadata/preprocessing_data

## Clone the huggingface dataset repo containing wiki dump
mkdir -p "$out_dir"
HUB_REPO_NAME=bs-modeling-metadata/wiki_dump
git clone "https://huggingface.co/datasets/${HUB_REPO_NAME}" "$out_dir/wiki_dump"


## Download the nltk punkt model used by the sentence tokenizer
python -m nltk.downloader 'punkt'
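As a quick sanity check after running the script, a short Python sketch (assuming nltk is installed per requirements.txt; the sample sentence is illustrative) can confirm the punkt model is in place:

import nltk

# Raises LookupError if the punkt model downloaded above is missing.
nltk.data.find("tokenizers/punkt")

# punkt backs nltk.sent_tokenize, which the website description utils rely on.
print(nltk.sent_tokenize("First sentence. Second sentence.")[0])  # -> First sentence.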
42 changes: 42 additions & 0 deletions bsmetadata/preprocessing_tools/website_desc_utils.py
@@ -0,0 +1,42 @@
import re
from collections import defaultdict
from typing import Optional

import nltk
from wikipedia2vec.dump_db import DumpDB


class WebsiteDescUtils:
def __init__(self, path_wiki_db) -> None:
self.cache = defaultdict(str)
self.wiki_dump_db = DumpDB(path_wiki_db)
self.redirects_map = {
key.lower(): value for key, value in self.wiki_dump_db.redirects()
} # loading all redirect information: takes ~10s

def fetch_wikipedia_title_from_keyword(self, keyword: str) -> str:
title = self.redirects_map.get(
keyword, keyword.split(".")[0].capitalize()
) # Fall back when the domain is not in the redirects map: try the db with the bare keyword (e.g. rightmove.com -> Rightmove), capitalized since Wikipedia titles begin with a capital letter.
return title

def fetch_wikipedia_description_for_title(self, title: str) -> Optional[str]:
try:
text = self.wiki_dump_db.get_paragraphs(title)[0].text
text = re.sub(r"\((?:[^)(]|\([^)(]*\))*\)", "", text)
text = nltk.sent_tokenize(text)[0] # Picking the first sentence
Contributor (@timoschick) commented:
Would it maybe make sense to remove information in brackets? For example, the first sentence in the Wikipedia article on Wikipedia itself is Wikipedia (/ˌwɪkɪˈpiːdiə/ (listen) wik-ih-PEE-dee-ə or /ˌwɪki-/ (listen) wik-ee-) is a free content, multilingual online encyclopedia written and maintained by a community of volunteers through a model of open collaboration, using a wiki-based editing system. At least in this case, I would think that the part in brackets (/ˌwɪkɪˈpiːdiə/ (listen) wik-ih-PEE-dee-ə or /ˌwɪki-/ (listen) wik-ee-) is completely useless for the model but will probably require many tokens. Any thoughts on that?

Collaborator (@SaulLu) replied:
Sounds like a great idea!

If it saves you some time @shanyas10, here is a little code snippet that will remove the text in parentheses:

import re

text = re.sub(r"\((?:[^)(]|\([^)(]*\))*\)", "", text)

On @timoschick's example, it will output: Wikipedia is a free content, multilingual online encyclopedia written and maintained by a community of volunteers through a model of open collaboration, using a wiki-based editing system.

Collaborator Author (@shanyas10) replied:
Makes a lot of sense, @timoschick.
Thank you for the snippet, @SaulLu. I've made the changes accordingly.

except Exception:
return None
return text

def extract_wiki_desc(self, keyword: str) -> Optional[str]:
title = self.fetch_wikipedia_title_from_keyword(keyword)
desc = self.fetch_wikipedia_description_for_title(title)
return desc

def fetch_website_description_from_keyword(self, keyword: str) -> Optional[str]:
if not self.cache[keyword]:  # defaultdict(str): an empty string means this keyword has not been looked up yet
self.cache[keyword] = self.extract_wiki_desc(keyword)

return self.cache[keyword]
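To make the review suggestion above concrete, here is a small sketch of the parenthesis-stripping pattern (the regex is the one from the thread; the sample string is illustrative):

import re

PAREN_PATTERN = r"\((?:[^)(]|\([^)(]*\))*\)"
# Removes parenthesized spans, including one level of nesting.
text = "Wikipedia (/wiki/ (listen) wik-ee) is an online encyclopedia"
print(re.sub(PAREN_PATTERN, "", text))  # -> "Wikipedia  is an online encyclopedia"

And a minimal usage sketch of WebsiteDescUtils itself, assuming the dump was fetched with download_wiki_dump.sh into the default location (the keyword is illustrative):

from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils

utils = WebsiteDescUtils("bsmetadata/preprocessing_data/wiki_dump/wiki_en_dump_db")
# Resolves the domain via the redirects map (or the capitalized fallback),
# then returns the first sentence of the matching article, or None.
print(utils.fetch_website_description_from_keyword("rightmove.com"))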
38 changes: 38 additions & 0 deletions bsmetadata/preprocessing_utils.py
@@ -23,6 +23,7 @@
from REL.ner import load_flair_ner
from REL.utils import process_results

from bsmetadata.preprocessing_tools.website_desc_utils import WebsiteDescUtils
from bsmetadata.vendor.dateutil.src.dateutil.parser import ParserError, parse


@@ -42,6 +43,11 @@ def parse_date(path):
return None


def fetch_keyword_from_url(url: str) -> str:  # e.g. http://www.californialandcan.org/Plumas -> californialandcan.org
domain = urlsplit(url).netloc
return domain.replace("www.", "")


def remove_improbable_date(x):
if x is not None and (x.year < 1983 or x.year > 2021):
return None
@@ -88,6 +94,38 @@ def _extract_timestamp_from_url(self, url: str) -> Optional[str]:
return date


class WebsiteDescPreprocessor(MetadataPreprocessor):
"""Metadata preprocessor for adding website description based on URLs."""

def __init__(self, path_wiki_db: str = "../preprocessing_data/wiki_dump/wiki_en_dump_db") -> None:
self.website_utils = WebsiteDescUtils(path_wiki_db)
super().__init__()

def preprocess(self, examples: Dict[str, List]) -> Dict[str, List]:
metadata_list = examples["metadata"]

# Iterate through the metadata associated with all examples in this batch.
for metadata in metadata_list:
# Get the URLs associated with this example.
urls = [md["value"] for md in metadata if md["key"] == "url"]

if not urls:
continue

# Try to extract a website description from the given URL and add it to the metadata.
website_description = self._extract_website_desc_from_url(urls[0])

if website_description:
metadata.append({"key": "website_description", "type": "global", "value": website_description})
return examples

def _extract_website_desc_from_url(self, url: str) -> Optional[str]:
keyword = fetch_keyword_from_url(url)
return self.website_utils.fetch_website_description_from_keyword(keyword)


class EntityPreprocessor(MetadataPreprocessor):
"""Metadata preprocessor for adding entity information."""

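For orientation, a minimal sketch of how the new preprocessor plugs into a datasets pipeline, mirroring the test below (the URL and db path are illustrative):

from datasets import Dataset

from bsmetadata.preprocessing_utils import WebsiteDescPreprocessor

ds = Dataset.from_dict(
    {
        "id": [0],
        "text": ["some text"],
        "metadata": [[{"key": "url", "type": "global", "value": "https://www.xyz.com"}]],
    }
)
processor = WebsiteDescPreprocessor(path_wiki_db="bsmetadata/preprocessing_data/wiki_dump/wiki_en_dump_db")
ds = ds.map(lambda ex: processor.preprocess(ex), batched=True)
print(ds[0]["metadata"])  # the url entry plus a website_description entry, if one was found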
2 changes: 2 additions & 0 deletions requirements.txt
@@ -4,3 +4,5 @@ wandb>=0.10.32,<1 # pip will likely update it to 0.12.1, but it is probably ok
transformers>=4.6.0,<5 # pip will likely update it to 4.10.0, but it is probably ok and good for bugfixes.
accelerate>=0.4.0,<1 # We may want to use 0.5.0 in the near future
datasets[streaming]>=1.11.0,<2
wikipedia2vec==1.0.5
nltk==3.6.5
Comment on lines +7 to +8
Collaborator (@SaulLu) commented:

Really a tiny detail, but I think we can make these dependencies optional by adding them to the setup.py in extras_require 🙂 :

setup(
    name="bsmetadata",
    python_requires=">=3.7.11, <3.10",
    version="0.1.0",
    url="https://github.com/bigscience-workshop/metadata.git",
    author="Multiple Authors",
    author_email="xxx",
    description="Codebase for including metadata (e.g., URLs, timestamps, HTML tags) during language model pretraining.",
    packages=find_packages(),
    install_requires=install_requires,
    extras_require={
        "website_description_preprocessing": ["wikipedia2vec==1.0.5", "nltk==3.6.5"],
    },
)

Collaborator Author (@shanyas10) replied:

Thank you for pointing this out @SaulLu :) I've made the required changes.

1 change: 1 addition & 0 deletions setup.py
@@ -21,5 +21,6 @@ def req_file(filename):
install_requires=install_requires,
extras_require={
"entity_preprocessing": ["REL @ git+https://github.com/manandey/REL.git#egg=REL"],
"website_description_preprocessing": ["wikipedia2vec==1.0.5", "nltk==3.6.5"],
},
)
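With the extra registered, the optional dependencies can be pulled in on demand with pip's standard extras syntax, e.g. pip install -e ".[website_description_preprocessing]" from the repository root.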
32 changes: 32 additions & 0 deletions tests/mocks/mock_dump_db.py
@@ -0,0 +1,32 @@
from typing import List


class MockParagraph:
def __init__(self, text):
self.text = text


class MockDumpDB:
def __init__(self, db_file) -> None:
self.db_file = db_file
self.redirect_info = [("xyz.com", "XYZ"), ("test.com", "Test"), ("test_key", "Test Key")]
self.paragraphs_map = {
"XYZ": [
MockParagraph("XYZ is a U.S. based company."),
MockParagraph("Test paragraph for the key XYZ."),
],
"Test": [
MockParagraph("Test is a U.S. based company."),
MockParagraph("Test paragraph for the key Test."),
],
"Sometitle": [
MockParagraph("SomeTitle is a U.S. based company."),
MockParagraph("Test paragraph for the key SomeTitle."),
],
}

def redirects(self) -> List[tuple]:
return self.redirect_info

def get_paragraphs(self, title: str):
return self.paragraphs_map[title]
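For clarity, a small sketch of how the mock stands in for the two DumpDB methods WebsiteDescUtils touches (the path is a dummy, as in the tests; the import assumes tests/ is on sys.path):

from mocks.mock_dump_db import MockDumpDB

db = MockDumpDB("some/path")
redirects = {key.lower(): value for key, value in db.redirects()}  # same map WebsiteDescUtils builds
assert redirects["xyz.com"] == "XYZ"
assert db.get_paragraphs("XYZ")[0].text == "XYZ is a U.S. based company."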
55 changes: 55 additions & 0 deletions tests/test_preprocessing_utils.py
@@ -0,0 +1,55 @@
import unittest
from unittest import mock

from datasets import Dataset
from mocks.mock_dump_db import MockDumpDB

from bsmetadata.preprocessing_utils import WebsiteDescPreprocessor


def mock_sent_tokenize(text):
return [text]


class WebsiteDescPreprocessorTester(unittest.TestCase):
@mock.patch("bsmetadata.preprocessing_tools.website_desc_utils.DumpDB")
def setUp(self, mock_db) -> None:
mock_db.return_value = MockDumpDB("some/path")
self.website_processor = WebsiteDescPreprocessor()
self.example_ids = [0, 1, 2]
self.example_text = ["test text 1", "test text 2", "test text 3"]
self.example_metadata = [
[{"key": "url", "type": "global", "value": "https://www.xyz.com"}],
[
{"key": "url", "type": "global", "value": "http://sometitle.com"},
{"key": "url", "type": "global", "value": "http://notfound.com"},
],
[{"key": "url", "type": "global", "value": "https://www.test.com"}],
]

self.example_dict = {"id": self.example_ids, "metadata": self.example_metadata, "text": self.example_text}

@mock.patch("bsmetadata.preprocessing_tools.website_desc_utils.nltk.sent_tokenize", new=mock_sent_tokenize)
def test_website_metadata_processor(self):
ds = Dataset.from_dict(self.example_dict)
ds = ds.map(lambda ex: self.website_processor.preprocess(ex), batched=True)
target_metadata = [
[
{"key": "url", "type": "global", "value": "https://www.xyz.com"},
{"key": "website_description", "type": "global", "value": "XYZ is a U.S. based company."},
],
[
{"key": "url", "type": "global", "value": "http://sometitle.com"},
{"key": "url", "type": "global", "value": "http://notfound.com"},
{"key": "website_description", "type": "global", "value": "SomeTitle is a U.S. based company."},
],
[
{"key": "url", "type": "global", "value": "https://www.test.com"},
{"key": "website_description", "type": "global", "value": "Test is a U.S. based company."},
],
]
self.assertEqual(ds[:]["metadata"], target_metadata)


if __name__ == "__main__":
unittest.main()
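The suite runs via the unittest entry point above; from the repository root, something like python -m pytest tests/ should also collect it, assuming pytest is installed and tests/ ends up on sys.path so the mocks import resolves.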