From d478b7b7c80502b09a1d3fb5237ad9113f072b00 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Wed, 21 May 2025 10:47:17 +0000 Subject: [PATCH] Refactor: Improve error handling, testing, and documentation. This commit introduces several improvements to the codebase: - **Bug Fix:** - `defs/etl_fetch.py`: Corrected the `fetch_resources` function to use the `source` parameter instead of a hardcoded URL. - **Testing:** - Established a unit testing framework using `unittest`. - Added tests for `defs/etl_fetch.py` (mocking `AsyncWebCrawler`). - Added tests for `defs/etl_convert.py` (including HTML and PDF conversion, with programmatic PDF fixture creation using `reportlab`). - `bamlTest.py` identified as a utility script; its purpose is now documented. - **Error Handling & Logging:** - Python scripts in `defs/` (etl_convert, etl_fetch, etl_query) now use Python's `logging` module for errors, warnings, and info messages, replacing `print()`. - `defs/etl_query.py`: Enhanced CSV parsing to specifically catch `polars.exceptions.ShapeError` when `truncate_ragged_lines=False`, logging an informative error and re-raising. - Shell scripts (`scripts/loadDirToTriplestore.sh`, `scripts/loadSitemapToTriplestore.sh`): - Added `set -e` for safer execution. - Added checks for `jsonld` and `curl` command existence. - Implemented robust error checking for `jsonld` and `curl` command failures during processing, with errors reported to stderr. - `loadSitemapToTriplestore.sh` now includes a summary of processed URLs and failures. - **Documentation (README.md):** - Added a new "Running Tests" section with instructions. - Documented the dependency on the `jsonld` command-line tool, including installation instructions (e.g., via `npm`). - Clarified the purpose of `bamlTest.py`. --- README.md | 23 +++++- defs/etl_convert.py | 20 +++--- defs/etl_fetch.py | 7 +- defs/etl_query.py | 28 ++++++-- scripts/loadDirToTriplestore.sh | 60 ++++++++++++++-- scripts/loadSitemapToTriplestore.sh | 105 ++++++++++++++++++++++++---- tests/__init__.py | 0 tests/test_etl_convert.py | 97 +++++++++++++++++++++++++ tests/test_etl_fetch.py | 29 ++++++++ 9 files changed, 330 insertions(+), 39 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/test_etl_convert.py create mode 100644 tests/test_etl_fetch.py diff --git a/README.md b/README.md index 30aa874..e19bf64 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,24 @@ I'm assuming you have set up a working environment with your triplestore and other systems you want. Note this repo is using the UV package management system. (see: https://docs.astral.sh/uv/) +Ensure all dependencies are installed by running `uv pip install -r requirements.txt` or by ensuring your `pyproject.toml` is up to date and then running `uv pip sync`. + +#### Dependencies +The shell scripts `scripts/loadDirToTriplestore.sh` and `scripts/loadSitemapToTriplestore.sh` rely on the `jsonld` command-line tool for processing JSON-LD data. A common implementation is the Node.js `jsonld-cli` package. +You can install it via npm: +```bash +npm install -g jsonld-cli +``` +For more information, visit: [https://www.npmjs.com/package/jsonld-cli](https://www.npmjs.com/package/jsonld-cli) + +### Running Tests + +Unit tests are located in the `tests/` directory. 
To run the tests, navigate to the root of the repository and execute the following command: + +```bash +python -m unittest discover -s tests +``` +This command will discover and run all tests within the `tests` directory. Make sure all project dependencies, including any test-specific dependencies like `reportlab` (which should be listed in `requirements.txt` or `pyproject.toml`), are installed using `uv`. ### Notebook prototype @@ -41,7 +59,7 @@ We can run these: ./scripts/github_jsonld_sitemap.py --output output/jldnew-sitemap.xml https://github.com/bio-xyz/BioAgents sampleJsonLdsNew ``` -To load out JSON-LD now, we can use the sitemap to pull the resources directly from GitHub. +To load out JSON-LD now, we can use the sitemap to pull the resources directly from GitHub. (Make sure you have `jsonld-cli` installed, see "Dependencies" section under "Setup"). ```bash ./scripts/loadSitemapToTriplestore.sh ./output/jld-sitemap.xml http://homelab.lan:7878/store @@ -96,6 +114,8 @@ Use the code bamlTest.py to use OpenAI (set the key with something like) export OPENAI_API_KEY="..." ``` +`bamlTest.py` is a utility script for sending a markdown file to BAML functions (`ExtractIdea` or `ExtractAssertion`) and saving the JSON output. It can be used for manual testing or exploration of these BAML functions. + > Note: Since this is using [BAML](https://github.com/BoundaryML/baml) it's easy to > modify [clients.baml](baml_src/clients.baml) and add in any client. Ollama, for local, > Xai, Google Gemini, etc. You will then need to modify the ``` client "openai/gpt-4o" ``` @@ -123,4 +143,3 @@ References: * BioAgent repo: https://github.com/bio-xyz/plugin-bioagent * DKG (origin trail): https://docs.origintrail.io/build-with-dkg/quickstart-test-drive-the-dkg-in-5-mins - diff --git a/defs/etl_convert.py b/defs/etl_convert.py index b8b2df2..d9daa4e 100644 --- a/defs/etl_convert.py +++ b/defs/etl_convert.py @@ -1,5 +1,6 @@ import requests import os +import logging import tempfile from urllib.parse import urlparse @@ -10,10 +11,11 @@ import html2text import PyPDF2 except ImportError: - print("Required libraries not found. Please install them using:") - print("pip install html2text PyPDF2") + logger.exception("Failed to import html2text or PyPDF2. Please install them using: pip install html2text PyPDF2") exit(1) +logger = logging.getLogger(__name__) + def is_url(path): """Check if the given path is a URL.""" try: @@ -48,7 +50,7 @@ def download_file(url): temp_file.close() return temp_file_path except Exception as e: - print(f"Error downloading file: {e}") + logger.exception(f"Error downloading file: {e}") return None def html_to_markdown(html_content): @@ -79,7 +81,7 @@ def pdf_to_markdown(pdf_path): return markdown_text except Exception as e: - print(f"Error converting PDF to Markdown: {e}") + logger.exception(f"Error converting PDF to Markdown: {e}") return None def convert_to_markdown(source, is_local=False): @@ -112,7 +114,7 @@ def convert_to_markdown(source, is_local=False): return html_to_markdown(html_content) except Exception as e: - print(f"Error converting to Markdown: {e}") + logger.exception(f"Error converting to Markdown: {e}") return None def convert_document(url=None, local_file=None, output_file=None): @@ -128,11 +130,11 @@ def convert_document(url=None, local_file=None, output_file=None): str: The Markdown content or None if conversion failed. 
""" if url and local_file: - print("Error: Please provide either a URL or a local file, not both.") + logger.error("Error: Please provide either a URL or a local file, not both.") return None if not url and not local_file: - print("Error: Please provide either a URL or a local file.") + logger.error("Error: Please provide either a URL or a local file.") return None # Convert the document @@ -149,8 +151,8 @@ def convert_document(url=None, local_file=None, output_file=None): try: with open(output_file, 'w', encoding='utf-8') as file: file.write(markdown_content) - print(f"Markdown saved to {output_file}") + logger.info(f"Markdown saved to {output_file}") except Exception as e: - print(f"Error saving Markdown to file: {e}") + logger.exception(f"Error saving Markdown to file: {e}") return markdown_content \ No newline at end of file diff --git a/defs/etl_fetch.py b/defs/etl_fetch.py index 60f0a59..2c080b9 100644 --- a/defs/etl_fetch.py +++ b/defs/etl_fetch.py @@ -7,13 +7,14 @@ import asyncio from crawl4ai import * +import logging +logger = logging.getLogger(__name__) async def fetch_resources(source): async with AsyncWebCrawler() as crawler: result = await crawler.arun( - # url="https://www.waterqualitydata.us" - url="https://www.hydrosheds.org/hydrosheds-core-downloads", + url=source, ) - print(result.markdown) \ No newline at end of file + logger.info(result.markdown) \ No newline at end of file diff --git a/defs/etl_query.py b/defs/etl_query.py index f7a74f3..cffefe8 100644 --- a/defs/etl_query.py +++ b/defs/etl_query.py @@ -4,11 +4,13 @@ import lancedb import polars as pl import requests +import logging +logger = logging.getLogger(__name__) def query_mode(source, sink, query, table): """Handle query mode operations""" - print(f"Query mode: Processing data from {source} to {sink}") + logger.info(f"Query mode: Processing data from {source} to {sink}") # Add query-specific logic here # Qlever params, not needed for oxigraph @@ -30,13 +32,25 @@ def query_mode(source, sink, query, table): response = requests.post(source, params=params, headers=headers, data=query) # Load response into Polars DataFrame - # df = pl.read_csv(StringIO(response.text)) - df = pl.read_csv(StringIO(response.text), truncate_ragged_lines=False) - - # print(df) + try: + # df = pl.read_csv(StringIO(response.text)) + df = pl.read_csv(StringIO(response.text), truncate_ragged_lines=False) + except pl.exceptions.ShapeError as e: + logger.exception( + "Failed to parse CSV response from SPARQL query due to ragged lines. " + "The number of columns is inconsistent across rows. " + f"Polars error: {e}" + ) + # Re-raise the exception as per requirement + raise + except Exception as e: + logger.exception(f"An unexpected error occurred while parsing CSV response: {e}") + raise + + # logger.info(df) # # Create or get LanceDB table and write data - print("Saving LanceDB table: ", table, "") + logger.info(f"Saving LanceDB table: {table}") db = lancedb.connect("./stores/lancedb") tbl = db.create_table(table, data=df, mode="overwrite") - print(tbl) + logger.info(tbl) diff --git a/scripts/loadDirToTriplestore.sh b/scripts/loadDirToTriplestore.sh index b6f1fcd..6045907 100644 --- a/scripts/loadDirToTriplestore.sh +++ b/scripts/loadDirToTriplestore.sh @@ -1,17 +1,67 @@ #!/bin/bash +set -e # Exit on error + +# Check if jsonld command exists +if ! command -v jsonld &> /dev/null; then + echo "Error: jsonld command not found. Please install it." >&2 + exit 1 +fi + +# Check if curl command exists +if ! 
command -v curl &> /dev/null; then + echo "Error: curl command not found. Please install it." >&2 + exit 1 +fi + # A wrapper script for loading RDF from a directory into a triplestore # Usage # ./jsonldDirLoader.sh ./jsonld/depth_strict http://nas.lan:49153/blazegraph/namespace/kb/sparql mc_cmd() { - find $1 -type f # kinda basic, might add a filter to it + find "$1" -type f # kinda basic, might add a filter to it } # If you use this for ntriples, be sure to compute and/or add in a graph in the URL target -for i in $(mc_cmd $1); do +for i in $(mc_cmd "$1"); do echo "-------------start-------------" - echo Next: $i - cat $i | jsonld format -q | curl -X POST -H 'Content-Type:text/x-nquads' --data-binary @- $2 + echo "Processing: $i" + + # Attempt to format the file with jsonld + # Capture stdout and stderr separately for jsonld + jsonld_output=$(cat "$i" | jsonld format -q 2> /tmp/jsonld_error.log) + jsonld_exit_code=$? + + if [ $jsonld_exit_code -ne 0 ]; then + echo "Error formatting $i with jsonld (exit code: $jsonld_exit_code). Skipping." >&2 + # Print jsonld error log if it exists + if [ -s /tmp/jsonld_error.log ]; then + cat /tmp/jsonld_error.log >&2 + fi + rm -f /tmp/jsonld_error.log # Clean up error log + echo "-------------done (failed formatting)--------------" >&2 + continue + fi + rm -f /tmp/jsonld_error.log # Clean up error log if successful + + # Attempt to upload the formatted data + # Capture stdout and stderr separately for curl + curl_output=$(echo "$jsonld_output" | curl -X POST -H 'Content-Type:text/x-nquads' --data-binary @- "$2" 2> /tmp/curl_error.log) + curl_exit_code=$? + + if [ $curl_exit_code -ne 0 ]; then + echo "Error uploading $i to $2 (exit code: $curl_exit_code). Skipping." >&2 + # Print curl error log if it exists + if [ -s /tmp/curl_error.log ]; then + cat /tmp/curl_error.log >&2 + fi + rm -f /tmp/curl_error.log # Clean up error log + echo "-------------done (failed upload)--------------" >&2 + continue + fi + rm -f /tmp/curl_error.log # Clean up error log if successful + + echo "Successfully processed and uploaded $i" + # Optionally print curl output if needed for success confirmation + # echo "Server response: $curl_output" echo "-------------done--------------" done - diff --git a/scripts/loadSitemapToTriplestore.sh b/scripts/loadSitemapToTriplestore.sh index 3cce2ac..9d35bc6 100755 --- a/scripts/loadSitemapToTriplestore.sh +++ b/scripts/loadSitemapToTriplestore.sh @@ -1,12 +1,26 @@ #!/bin/bash +set -e # Exit on error + +# Check if jsonld command exists +if ! command -v jsonld &> /dev/null; then + echo "Error: jsonld command not found. Please install it." >&2 + exit 1 +fi + +# Check if curl command exists +if ! command -v curl &> /dev/null; then + echo "Error: curl command not found. Please install it." >&2 + exit 1 +fi + # A script for loading JSON-LD from a sitemap into a triplestore # Usage # ./loadSitemapToTriplestore.sh ../output/jld-sitemap.xml http://homelab.lan:7878/store # Check if required parameters are provided if [ $# -ne 2 ]; then - echo "Usage: $0 " - echo "Example: $0 ../output/jld-sitemap.xml http://homelab.lan:7878/store" + echo "Usage: $0 " >&2 + echo "Example: $0 ../output/jld-sitemap.xml http://homelab.lan:7878/store" >&2 exit 1 fi @@ -15,7 +29,7 @@ TRIPLESTORE_ENDPOINT=$2 # Check if the sitemap file exists if [ ! -f "$SITEMAP_FILE" ]; then - echo "Error: Sitemap file '$SITEMAP_FILE' not found." + echo "Error: Sitemap file '$SITEMAP_FILE' not found." 
>&2 exit 1 fi @@ -29,26 +43,80 @@ URLS=$(grep -o '.*' "$SITEMAP_FILE" | sed 's/\(.*\)<\/loc>/\1/') # Process each URL count=0 -total=$(echo "$URLS" | wc -l) -echo "Found $total URLs to process" +processed_count=0 +failed_downloads=0 +failed_formatting=0 +failed_uploads=0 + +total_urls=$(echo "$URLS" | wc -l | xargs) # xargs to trim whitespace + +echo "Found $total_urls URLs to process" for url in $URLS; do count=$((count + 1)) echo "-------------start-------------" - echo "[$count/$total] Processing: $url" + echo "[$count/$total_urls] Processing: $url" # Download the content echo "Downloading content..." - TEMP_FILE="$TEMP_DIR/$(basename "$url")" - curl -s "$url" -o "$TEMP_FILE" + # Add timestamp to basename to avoid name clashes if sitemap has duplicate basenames + TEMP_FILE="$TEMP_DIR/$(basename "$url")_$(date +%s%N)" + + curl -s -L "$url" -o "$TEMP_FILE" # Added -L to follow redirects + curl_download_exit_code=$? + + if [ $curl_download_exit_code -ne 0 ]; then + echo "Error downloading content from $url (exit code: $curl_download_exit_code). Skipping." >&2 + echo "-------------done (failed download)--------------" >&2 + failed_downloads=$((failed_downloads + 1)) + continue + fi if [ -s "$TEMP_FILE" ]; then - # Process and upload to triplestore echo "Processing and uploading to triplestore..." - cat "$TEMP_FILE" | jsonld format -q | curl -X POST -H 'Content-Type:text/x-nquads' --data-binary @- "$TRIPLESTORE_ENDPOINT" - echo "Upload complete." + + # Attempt to format the file with jsonld + # Capture stdout and stderr separately for jsonld + jsonld_output=$(cat "$TEMP_FILE" | jsonld format -q 2> /tmp/sitemap_jsonld_error.log) + jsonld_exit_code=$? + + if [ $jsonld_exit_code -ne 0 ]; then + echo "Error formatting content from $url with jsonld (exit code: $jsonld_exit_code). Skipping." >&2 + if [ -s /tmp/sitemap_jsonld_error.log ]; then + cat /tmp/sitemap_jsonld_error.log >&2 + fi + rm -f /tmp/sitemap_jsonld_error.log + echo "-------------done (failed formatting)--------------" >&2 + failed_formatting=$((failed_formatting + 1)) + continue + fi + rm -f /tmp/sitemap_jsonld_error.log # Clean up error log if successful + + # Attempt to upload the formatted data + # Capture stdout and stderr separately for curl + curl_upload_output=$(echo "$jsonld_output" | curl -X POST -H 'Content-Type:text/x-nquads' --data-binary @- "$TRIPLESTORE_ENDPOINT" 2> /tmp/sitemap_curl_error.log) + curl_upload_exit_code=$? + + if [ $curl_upload_exit_code -ne 0 ]; then + echo "Error uploading content from $url to $TRIPLESTORE_ENDPOINT (exit code: $curl_upload_exit_code). Skipping." >&2 + if [ -s /tmp/sitemap_curl_error.log ]; then + cat /tmp/sitemap_curl_error.log >&2 + fi + rm -f /tmp/sitemap_curl_error.log + echo "-------------done (failed upload)--------------" >&2 + failed_uploads=$((failed_uploads + 1)) + continue + fi + rm -f /tmp/sitemap_curl_error.log # Clean up error log if successful + + echo "Upload complete for $url." + # Optionally print curl output for success + # echo "Server response: $curl_upload_output" + processed_count=$((processed_count + 1)) else - echo "Error: Failed to download content from $url" + echo "Error: Failed to download content from $url (empty file received). Skipping." >&2 + echo "-------------done (empty download)--------------" >&2 + failed_downloads=$((failed_downloads + 1)) fi echo "-------------done--------------" @@ -57,4 +125,15 @@ done # Clean up echo "Cleaning up temporary files..." rm -rf "$TEMP_DIR" -echo "All done! Processed $count URLs." 
\ No newline at end of file +echo "All done!" +echo "Summary:" +echo "Total URLs found: $total_urls" +echo "Successfully processed and uploaded: $processed_count" +echo "Failed downloads: $failed_downloads" +echo "Failed formatting (jsonld): $failed_formatting" +echo "Failed uploads (curl): $failed_uploads" + +# Exit with a non-zero code if any failures occurred for easier scripting +if [ $((failed_downloads + failed_formatting + failed_uploads)) -gt 0 ]; then + exit 1 +fi \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_etl_convert.py b/tests/test_etl_convert.py new file mode 100644 index 0000000..144506b --- /dev/null +++ b/tests/test_etl_convert.py @@ -0,0 +1,97 @@ +import unittest +import os +import shutil # For cleaning up the fixtures directory +from defs.etl_convert import convert_document, html_to_markdown + +# Attempt to import reportlab, which is needed for creating a dummy PDF +try: + from reportlab.pdfgen import canvas + from reportlab.lib.pagesizes import letter + REPORTLAB_AVAILABLE = True +except ImportError: + REPORTLAB_AVAILABLE = False + +class TestEtlConvert(unittest.TestCase): + + def setUp(self): + """Set up test fixtures.""" + self.base_dir = os.path.dirname(__file__) + self.fixtures_dir = os.path.join(self.base_dir, "fixtures") + if not os.path.exists(self.fixtures_dir): + os.makedirs(self.fixtures_dir) + + def tearDown(self): + """Clean up test fixtures.""" + if os.path.exists(self.fixtures_dir): + shutil.rmtree(self.fixtures_dir) + + def test_html_to_markdown_conversion(self): + """Test direct HTML to Markdown conversion using html_to_markdown.""" + html_content = "
<html><body><h1>Test Heading</h1><p>This is a test paragraph.</p></body></html>
" + # html2text uses # for H1 + expected_markdown = "# Test Heading\n\nThis is a test paragraph." + markdown_content = html_to_markdown(html_content) + # Normalize newlines and strip whitespace for comparison + normalized_markdown = "\n".join(line.strip() for line in markdown_content.strip().splitlines() if line.strip()) + normalized_expected = "\n".join(line.strip() for line in expected_markdown.strip().splitlines() if line.strip()) + self.assertEqual(normalized_markdown, normalized_expected) + + def test_convert_document_html_local_file(self): + """Test converting a local HTML file to Markdown using convert_document.""" + dummy_html_path = os.path.join(self.fixtures_dir, "dummy.html") + html_content = "
<html><body><h1>Local HTML Test</h1><p>A paragraph here.</p></body></html>
" + # html2text uses # for H1 + expected_markdown_fragment = "# Local HTML Test\n\nA paragraph here." + + with open(dummy_html_path, "w", encoding="utf-8") as f: + f.write(html_content) + + markdown_content = convert_document(local_file=dummy_html_path) + self.assertIsNotNone(markdown_content) + # Normalize for comparison + normalized_markdown = "\n".join(line.strip() for line in markdown_content.strip().splitlines() if line.strip()) + normalized_expected = "\n".join(line.strip() for line in expected_markdown_fragment.strip().splitlines() if line.strip()) + self.assertIn(normalized_expected, normalized_markdown) + + @unittest.skipIf(not REPORTLAB_AVAILABLE, "reportlab is not installed, skipping PDF test.") + def test_convert_document_pdf_local_file(self): + """Test converting a local PDF file to Markdown using convert_document.""" + dummy_pdf_path = os.path.join(self.fixtures_dir, "dummy.pdf") + pdf_text_content = "This is a test PDF document for conversion." + + # Create a dummy PDF using reportlab + c = canvas.Canvas(dummy_pdf_path, pagesize=letter) + c.drawString(100, 750, pdf_text_content) + c.save() + + self.assertTrue(os.path.exists(dummy_pdf_path), "Dummy PDF file was not created.") + + markdown_content = convert_document(local_file=dummy_pdf_path) + self.assertIsNotNone(markdown_content, "Markdown conversion returned None.") + + # PyPDF2 adds "## Page X" headers. We need to check if the text content is present. + # The exact output can vary, so we check for the presence of the core text. + self.assertIn(pdf_text_content, markdown_content, + f"Expected text '{pdf_text_content}' not found in converted Markdown:\n{markdown_content}") + + def test_convert_document_no_source(self): + """Test convert_document with no source provided.""" + # Expecting it to print an error and return None + with unittest.mock.patch('builtins.print') as mocked_print: + result = convert_document() + self.assertIsNone(result) + mocked_print.assert_any_call("Error: Please provide either a URL or a local file.") + + def test_convert_document_both_sources(self): + """Test convert_document with both URL and local_file provided.""" + dummy_html_path = os.path.join(self.fixtures_dir, "dummy_both.html") + with open(dummy_html_path, "w") as f: + f.write("
<html><body><p>test</p></body></html>
") + + with unittest.mock.patch('builtins.print') as mocked_print: + result = convert_document(url="http://example.com", local_file=dummy_html_path) + self.assertIsNone(result) + mocked_print.assert_any_call("Error: Please provide either a URL or a local file, not both.") + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_etl_fetch.py b/tests/test_etl_fetch.py new file mode 100644 index 0000000..3e5fc23 --- /dev/null +++ b/tests/test_etl_fetch.py @@ -0,0 +1,29 @@ +import asyncio +import unittest +from unittest.mock import AsyncMock, patch + +# Adjust the import path based on your project structure if necessary +# This assumes that the 'defs' directory is in the PYTHONPATH +# or that tests are run from the root directory. +from defs.etl_fetch import fetch_resources + +class TestEtlFetch(unittest.IsolatedAsyncioTestCase): # Changed to IsolatedAsyncioTestCase + + @patch('defs.etl_fetch.AsyncWebCrawler') + async def test_fetch_resources_uses_source_url(self, MockAsyncWebCrawler): + # Configure the mock + mock_crawler_instance = MockAsyncWebCrawler.return_value + # For async context manager __aenter__ and __aexit__ needs to be AsyncMock + mock_crawler_instance.__aenter__ = AsyncMock(return_value=mock_crawler_instance) + mock_crawler_instance.__aexit__ = AsyncMock(return_value=None) + + # arun should also be an AsyncMock if it's awaited + mock_crawler_instance.arun = AsyncMock(return_value=type('obj', (object,), {'markdown': 'mocked markdown'})()) + + test_url = "http://example.com/test_source" + + await fetch_resources(test_url) + + mock_crawler_instance.arun.assert_called_once_with(url=test_url) + +# Removed the if __name__ == '__main__': block to rely on test discovery
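
Because this patch routes output through Python's `logging` module at INFO level instead of `print()`, a caller sees nothing under the default WARNING root log level. A minimal caller sketch, not part of the patch itself (it assumes execution from the repository root so `defs` is importable, and reuses the URL that the old hardcoded version fetched):

```python
# Sketch only: configure logging so the INFO-level Markdown emitted by
# fetch_resources() is visible, then run the coroutine.
import asyncio
import logging

from defs.etl_fetch import fetch_resources

# The root logger defaults to WARNING, which would hide the fetched Markdown.
logging.basicConfig(level=logging.INFO)

asyncio.run(fetch_resources("https://www.hydrosheds.org/hydrosheds-core-downloads"))
```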