From d478b7b7c80502b09a1d3fb5237ad9113f072b00 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Wed, 21 May 2025 10:47:17 +0000 Subject: [PATCH] Refactor: Improve error handling, testing, and documentation. This commit introduces several improvements to the codebase: - **Bug Fix:** - `defs/etl_fetch.py`: Corrected the `fetch_resources` function to use the `source` parameter instead of a hardcoded URL. - **Testing:** - Established a unit testing framework using `unittest`. - Added tests for `defs/etl_fetch.py` (mocking `AsyncWebCrawler`). - Added tests for `defs/etl_convert.py` (including HTML and PDF conversion, with programmatic PDF fixture creation using `reportlab`). - `bamlTest.py` identified as a utility script; its purpose is now documented. - **Error Handling & Logging:** - Python scripts in `defs/` (etl_convert, etl_fetch, etl_query) now use Python's `logging` module for errors, warnings, and info messages, replacing `print()`. - `defs/etl_query.py`: Enhanced CSV parsing to specifically catch `polars.exceptions.ShapeError` when `truncate_ragged_lines=False`, logging an informative error and re-raising. - Shell scripts (`scripts/loadDirToTriplestore.sh`, `scripts/loadSitemapToTriplestore.sh`): - Added `set -e` for safer execution. - Added checks for `jsonld` and `curl` command existence. - Implemented robust error checking for `jsonld` and `curl` command failures during processing, with errors reported to stderr. - `loadSitemapToTriplestore.sh` now includes a summary of processed URLs and failures. - **Documentation (README.md):** - Added a new "Running Tests" section with instructions. - Documented the dependency on the `jsonld` command-line tool, including installation instructions (e.g., via `npm`). - Clarified the purpose of `bamlTest.py`. --- README.md | 23 +++++- defs/etl_convert.py | 20 +++--- defs/etl_fetch.py | 7 +- defs/etl_query.py | 28 ++++++-- scripts/loadDirToTriplestore.sh | 60 ++++++++++++++-- scripts/loadSitemapToTriplestore.sh | 105 ++++++++++++++++++++++++---- tests/__init__.py | 0 tests/test_etl_convert.py | 97 +++++++++++++++++++++++++ tests/test_etl_fetch.py | 29 ++++++++ 9 files changed, 330 insertions(+), 39 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/test_etl_convert.py create mode 100644 tests/test_etl_fetch.py diff --git a/README.md b/README.md index 30aa874..e19bf64 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,24 @@ I'm assuming you have set up a working environment with your triplestore and other systems you want. Note this repo is using the UV package management system. (see: https://docs.astral.sh/uv/) +Ensure all dependencies are installed by running `uv pip install -r requirements.txt` or by ensuring your `pyproject.toml` is up to date and then running `uv pip sync`. + +#### Dependencies +The shell scripts `scripts/loadDirToTriplestore.sh` and `scripts/loadSitemapToTriplestore.sh` rely on the `jsonld` command-line tool for processing JSON-LD data. A common implementation is the Node.js `jsonld-cli` package. +You can install it via npm: +```bash +npm install -g jsonld-cli +``` +For more information, visit: [https://www.npmjs.com/package/jsonld-cli](https://www.npmjs.com/package/jsonld-cli) + +### Running Tests + +Unit tests are located in the `tests/` directory. 
To run the tests, navigate to the root of the repository and execute the following command: + +```bash +python -m unittest discover -s tests +``` +This command will discover and run all tests within the `tests` directory. Make sure all project dependencies, including any test-specific dependencies like `reportlab` (which should be listed in `requirements.txt` or `pyproject.toml`), are installed using `uv`. ### Notebook prototype @@ -41,7 +59,7 @@ We can run these: ./scripts/github_jsonld_sitemap.py --output output/jldnew-sitemap.xml https://github.com/bio-xyz/BioAgents sampleJsonLdsNew ``` -To load out JSON-LD now, we can use the sitemap to pull the resources directly from GitHub. +To load out JSON-LD now, we can use the sitemap to pull the resources directly from GitHub. (Make sure you have `jsonld-cli` installed, see "Dependencies" section under "Setup"). ```bash ./scripts/loadSitemapToTriplestore.sh ./output/jld-sitemap.xml http://homelab.lan:7878/store @@ -96,6 +114,8 @@ Use the code bamlTest.py to use OpenAI (set the key with something like) export OPENAI_API_KEY="..." ``` +`bamlTest.py` is a utility script for sending a markdown file to BAML functions (`ExtractIdea` or `ExtractAssertion`) and saving the JSON output. It can be used for manual testing or exploration of these BAML functions. + > Note: Since this is using [BAML](https://github.com/BoundaryML/baml) it's easy to > modify [clients.baml](baml_src/clients.baml) and add in any client. Ollama, for local, > Xai, Google Gemini, etc. You will then need to modify the ``` client "openai/gpt-4o" ``` @@ -123,4 +143,3 @@ References: * BioAgent repo: https://github.com/bio-xyz/plugin-bioagent * DKG (origin trail): https://docs.origintrail.io/build-with-dkg/quickstart-test-drive-the-dkg-in-5-mins - diff --git a/defs/etl_convert.py b/defs/etl_convert.py index b8b2df2..d9daa4e 100644 --- a/defs/etl_convert.py +++ b/defs/etl_convert.py @@ -1,5 +1,6 @@ import requests import os +import logging import tempfile from urllib.parse import urlparse @@ -10,10 +11,11 @@ import html2text import PyPDF2 except ImportError: - print("Required libraries not found. Please install them using:") - print("pip install html2text PyPDF2") + logger.exception("Failed to import html2text or PyPDF2. Please install them using: pip install html2text PyPDF2") exit(1) +logger = logging.getLogger(__name__) + def is_url(path): """Check if the given path is a URL.""" try: @@ -48,7 +50,7 @@ def download_file(url): temp_file.close() return temp_file_path except Exception as e: - print(f"Error downloading file: {e}") + logger.exception(f"Error downloading file: {e}") return None def html_to_markdown(html_content): @@ -79,7 +81,7 @@ def pdf_to_markdown(pdf_path): return markdown_text except Exception as e: - print(f"Error converting PDF to Markdown: {e}") + logger.exception(f"Error converting PDF to Markdown: {e}") return None def convert_to_markdown(source, is_local=False): @@ -112,7 +114,7 @@ def convert_to_markdown(source, is_local=False): return html_to_markdown(html_content) except Exception as e: - print(f"Error converting to Markdown: {e}") + logger.exception(f"Error converting to Markdown: {e}") return None def convert_document(url=None, local_file=None, output_file=None): @@ -128,11 +130,11 @@ def convert_document(url=None, local_file=None, output_file=None): str: The Markdown content or None if conversion failed. 
""" if url and local_file: - print("Error: Please provide either a URL or a local file, not both.") + logger.error("Error: Please provide either a URL or a local file, not both.") return None if not url and not local_file: - print("Error: Please provide either a URL or a local file.") + logger.error("Error: Please provide either a URL or a local file.") return None # Convert the document @@ -149,8 +151,8 @@ def convert_document(url=None, local_file=None, output_file=None): try: with open(output_file, 'w', encoding='utf-8') as file: file.write(markdown_content) - print(f"Markdown saved to {output_file}") + logger.info(f"Markdown saved to {output_file}") except Exception as e: - print(f"Error saving Markdown to file: {e}") + logger.exception(f"Error saving Markdown to file: {e}") return markdown_content \ No newline at end of file diff --git a/defs/etl_fetch.py b/defs/etl_fetch.py index 60f0a59..2c080b9 100644 --- a/defs/etl_fetch.py +++ b/defs/etl_fetch.py @@ -7,13 +7,14 @@ import asyncio from crawl4ai import * +import logging +logger = logging.getLogger(__name__) async def fetch_resources(source): async with AsyncWebCrawler() as crawler: result = await crawler.arun( - # url="https://www.waterqualitydata.us" - url="https://www.hydrosheds.org/hydrosheds-core-downloads", + url=source, ) - print(result.markdown) \ No newline at end of file + logger.info(result.markdown) \ No newline at end of file diff --git a/defs/etl_query.py b/defs/etl_query.py index f7a74f3..cffefe8 100644 --- a/defs/etl_query.py +++ b/defs/etl_query.py @@ -4,11 +4,13 @@ import lancedb import polars as pl import requests +import logging +logger = logging.getLogger(__name__) def query_mode(source, sink, query, table): """Handle query mode operations""" - print(f"Query mode: Processing data from {source} to {sink}") + logger.info(f"Query mode: Processing data from {source} to {sink}") # Add query-specific logic here # Qlever params, not needed for oxigraph @@ -30,13 +32,25 @@ def query_mode(source, sink, query, table): response = requests.post(source, params=params, headers=headers, data=query) # Load response into Polars DataFrame - # df = pl.read_csv(StringIO(response.text)) - df = pl.read_csv(StringIO(response.text), truncate_ragged_lines=False) - - # print(df) + try: + # df = pl.read_csv(StringIO(response.text)) + df = pl.read_csv(StringIO(response.text), truncate_ragged_lines=False) + except pl.exceptions.ShapeError as e: + logger.exception( + "Failed to parse CSV response from SPARQL query due to ragged lines. " + "The number of columns is inconsistent across rows. " + f"Polars error: {e}" + ) + # Re-raise the exception as per requirement + raise + except Exception as e: + logger.exception(f"An unexpected error occurred while parsing CSV response: {e}") + raise + + # logger.info(df) # # Create or get LanceDB table and write data - print("Saving LanceDB table: ", table, "") + logger.info(f"Saving LanceDB table: {table}") db = lancedb.connect("./stores/lancedb") tbl = db.create_table(table, data=df, mode="overwrite") - print(tbl) + logger.info(tbl) diff --git a/scripts/loadDirToTriplestore.sh b/scripts/loadDirToTriplestore.sh index b6f1fcd..6045907 100644 --- a/scripts/loadDirToTriplestore.sh +++ b/scripts/loadDirToTriplestore.sh @@ -1,17 +1,67 @@ #!/bin/bash +set -e # Exit on error + +# Check if jsonld command exists +if ! command -v jsonld &> /dev/null; then + echo "Error: jsonld command not found. Please install it." >&2 + exit 1 +fi + +# Check if curl command exists +if ! 
command -v curl &> /dev/null; then + echo "Error: curl command not found. Please install it." >&2 + exit 1 +fi + # A wrapper script for loading RDF from a directory into a triplestore # Usage # ./jsonldDirLoader.sh ./jsonld/depth_strict http://nas.lan:49153/blazegraph/namespace/kb/sparql mc_cmd() { - find $1 -type f # kinda basic, might add a filter to it + find "$1" -type f # kinda basic, might add a filter to it } # If you use this for ntriples, be sure to compute and/or add in a graph in the URL target -for i in $(mc_cmd $1); do +for i in $(mc_cmd "$1"); do echo "-------------start-------------" - echo Next: $i - cat $i | jsonld format -q | curl -X POST -H 'Content-Type:text/x-nquads' --data-binary @- $2 + echo "Processing: $i" + + # Attempt to format the file with jsonld + # Capture stdout and stderr separately for jsonld + jsonld_output=$(cat "$i" | jsonld format -q 2> /tmp/jsonld_error.log) + jsonld_exit_code=$? + + if [ $jsonld_exit_code -ne 0 ]; then + echo "Error formatting $i with jsonld (exit code: $jsonld_exit_code). Skipping." >&2 + # Print jsonld error log if it exists + if [ -s /tmp/jsonld_error.log ]; then + cat /tmp/jsonld_error.log >&2 + fi + rm -f /tmp/jsonld_error.log # Clean up error log + echo "-------------done (failed formatting)--------------" >&2 + continue + fi + rm -f /tmp/jsonld_error.log # Clean up error log if successful + + # Attempt to upload the formatted data + # Capture stdout and stderr separately for curl + curl_output=$(echo "$jsonld_output" | curl -X POST -H 'Content-Type:text/x-nquads' --data-binary @- "$2" 2> /tmp/curl_error.log) + curl_exit_code=$? + + if [ $curl_exit_code -ne 0 ]; then + echo "Error uploading $i to $2 (exit code: $curl_exit_code). Skipping." >&2 + # Print curl error log if it exists + if [ -s /tmp/curl_error.log ]; then + cat /tmp/curl_error.log >&2 + fi + rm -f /tmp/curl_error.log # Clean up error log + echo "-------------done (failed upload)--------------" >&2 + continue + fi + rm -f /tmp/curl_error.log # Clean up error log if successful + + echo "Successfully processed and uploaded $i" + # Optionally print curl output if needed for success confirmation + # echo "Server response: $curl_output" echo "-------------done--------------" done - diff --git a/scripts/loadSitemapToTriplestore.sh b/scripts/loadSitemapToTriplestore.sh index 3cce2ac..9d35bc6 100755 --- a/scripts/loadSitemapToTriplestore.sh +++ b/scripts/loadSitemapToTriplestore.sh @@ -1,12 +1,26 @@ #!/bin/bash +set -e # Exit on error + +# Check if jsonld command exists +if ! command -v jsonld &> /dev/null; then + echo "Error: jsonld command not found. Please install it." >&2 + exit 1 +fi + +# Check if curl command exists +if ! command -v curl &> /dev/null; then + echo "Error: curl command not found. Please install it." >&2 + exit 1 +fi + # A script for loading JSON-LD from a sitemap into a triplestore # Usage # ./loadSitemapToTriplestore.sh ../output/jld-sitemap.xml http://homelab.lan:7878/store # Check if required parameters are provided if [ $# -ne 2 ]; then - echo "Usage: $0 " - echo "Example: $0 ../output/jld-sitemap.xml http://homelab.lan:7878/store" + echo "Usage: $0 " >&2 + echo "Example: $0 ../output/jld-sitemap.xml http://homelab.lan:7878/store" >&2 exit 1 fi @@ -15,7 +29,7 @@ TRIPLESTORE_ENDPOINT=$2 # Check if the sitemap file exists if [ ! -f "$SITEMAP_FILE" ]; then - echo "Error: Sitemap file '$SITEMAP_FILE' not found." + echo "Error: Sitemap file '$SITEMAP_FILE' not found." 
>&2 exit 1 fi @@ -29,26 +43,80 @@ URLS=$(grep -o '.*' "$SITEMAP_FILE" | sed 's/\(.*\)<\/loc>/\1/') # Process each URL count=0 -total=$(echo "$URLS" | wc -l) -echo "Found $total URLs to process" +processed_count=0 +failed_downloads=0 +failed_formatting=0 +failed_uploads=0 + +total_urls=$(echo "$URLS" | wc -l | xargs) # xargs to trim whitespace + +echo "Found $total_urls URLs to process" for url in $URLS; do count=$((count + 1)) echo "-------------start-------------" - echo "[$count/$total] Processing: $url" + echo "[$count/$total_urls] Processing: $url" # Download the content echo "Downloading content..." - TEMP_FILE="$TEMP_DIR/$(basename "$url")" - curl -s "$url" -o "$TEMP_FILE" + # Add timestamp to basename to avoid name clashes if sitemap has duplicate basenames + TEMP_FILE="$TEMP_DIR/$(basename "$url")_$(date +%s%N)" + + curl -s -L "$url" -o "$TEMP_FILE" # Added -L to follow redirects + curl_download_exit_code=$? + + if [ $curl_download_exit_code -ne 0 ]; then + echo "Error downloading content from $url (exit code: $curl_download_exit_code). Skipping." >&2 + echo "-------------done (failed download)--------------" >&2 + failed_downloads=$((failed_downloads + 1)) + continue + fi if [ -s "$TEMP_FILE" ]; then - # Process and upload to triplestore echo "Processing and uploading to triplestore..." - cat "$TEMP_FILE" | jsonld format -q | curl -X POST -H 'Content-Type:text/x-nquads' --data-binary @- "$TRIPLESTORE_ENDPOINT" - echo "Upload complete." + + # Attempt to format the file with jsonld + # Capture stdout and stderr separately for jsonld + jsonld_output=$(cat "$TEMP_FILE" | jsonld format -q 2> /tmp/sitemap_jsonld_error.log) + jsonld_exit_code=$? + + if [ $jsonld_exit_code -ne 0 ]; then + echo "Error formatting content from $url with jsonld (exit code: $jsonld_exit_code). Skipping." >&2 + if [ -s /tmp/sitemap_jsonld_error.log ]; then + cat /tmp/sitemap_jsonld_error.log >&2 + fi + rm -f /tmp/sitemap_jsonld_error.log + echo "-------------done (failed formatting)--------------" >&2 + failed_formatting=$((failed_formatting + 1)) + continue + fi + rm -f /tmp/sitemap_jsonld_error.log # Clean up error log if successful + + # Attempt to upload the formatted data + # Capture stdout and stderr separately for curl + curl_upload_output=$(echo "$jsonld_output" | curl -X POST -H 'Content-Type:text/x-nquads' --data-binary @- "$TRIPLESTORE_ENDPOINT" 2> /tmp/sitemap_curl_error.log) + curl_upload_exit_code=$? + + if [ $curl_upload_exit_code -ne 0 ]; then + echo "Error uploading content from $url to $TRIPLESTORE_ENDPOINT (exit code: $curl_upload_exit_code). Skipping." >&2 + if [ -s /tmp/sitemap_curl_error.log ]; then + cat /tmp/sitemap_curl_error.log >&2 + fi + rm -f /tmp/sitemap_curl_error.log + echo "-------------done (failed upload)--------------" >&2 + failed_uploads=$((failed_uploads + 1)) + continue + fi + rm -f /tmp/sitemap_curl_error.log # Clean up error log if successful + + echo "Upload complete for $url." + # Optionally print curl output for success + # echo "Server response: $curl_upload_output" + processed_count=$((processed_count + 1)) else - echo "Error: Failed to download content from $url" + echo "Error: Failed to download content from $url (empty file received). Skipping." >&2 + echo "-------------done (empty download)--------------" >&2 + failed_downloads=$((failed_downloads + 1)) fi echo "-------------done--------------" @@ -57,4 +125,15 @@ done # Clean up echo "Cleaning up temporary files..." rm -rf "$TEMP_DIR" -echo "All done! Processed $count URLs." 
\ No newline at end of file +echo "All done!" +echo "Summary:" +echo "Total URLs found: $total_urls" +echo "Successfully processed and uploaded: $processed_count" +echo "Failed downloads: $failed_downloads" +echo "Failed formatting (jsonld): $failed_formatting" +echo "Failed uploads (curl): $failed_uploads" + +# Exit with a non-zero code if any failures occurred for easier scripting +if [ $((failed_downloads + failed_formatting + failed_uploads)) -gt 0 ]; then + exit 1 +fi \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_etl_convert.py b/tests/test_etl_convert.py new file mode 100644 index 0000000..144506b --- /dev/null +++ b/tests/test_etl_convert.py @@ -0,0 +1,97 @@ +import unittest +import os +import shutil # For cleaning up the fixtures directory +from defs.etl_convert import convert_document, html_to_markdown + +# Attempt to import reportlab, which is needed for creating a dummy PDF +try: + from reportlab.pdfgen import canvas + from reportlab.lib.pagesizes import letter + REPORTLAB_AVAILABLE = True +except ImportError: + REPORTLAB_AVAILABLE = False + +class TestEtlConvert(unittest.TestCase): + + def setUp(self): + """Set up test fixtures.""" + self.base_dir = os.path.dirname(__file__) + self.fixtures_dir = os.path.join(self.base_dir, "fixtures") + if not os.path.exists(self.fixtures_dir): + os.makedirs(self.fixtures_dir) + + def tearDown(self): + """Clean up test fixtures.""" + if os.path.exists(self.fixtures_dir): + shutil.rmtree(self.fixtures_dir) + + def test_html_to_markdown_conversion(self): + """Test direct HTML to Markdown conversion using html_to_markdown.""" + html_content = "
<html><body><h1>Test Heading</h1><p>This is a test paragraph.</p></body></html>
" + # html2text uses # for H1 + expected_markdown = "# Test Heading\n\nThis is a test paragraph." + markdown_content = html_to_markdown(html_content) + # Normalize newlines and strip whitespace for comparison + normalized_markdown = "\n".join(line.strip() for line in markdown_content.strip().splitlines() if line.strip()) + normalized_expected = "\n".join(line.strip() for line in expected_markdown.strip().splitlines() if line.strip()) + self.assertEqual(normalized_markdown, normalized_expected) + + def test_convert_document_html_local_file(self): + """Test converting a local HTML file to Markdown using convert_document.""" + dummy_html_path = os.path.join(self.fixtures_dir, "dummy.html") + html_content = "
<html><body><h1>Local HTML Test</h1><p>A paragraph here.</p></body></html>
" + # html2text uses # for H1 + expected_markdown_fragment = "# Local HTML Test\n\nA paragraph here." + + with open(dummy_html_path, "w", encoding="utf-8") as f: + f.write(html_content) + + markdown_content = convert_document(local_file=dummy_html_path) + self.assertIsNotNone(markdown_content) + # Normalize for comparison + normalized_markdown = "\n".join(line.strip() for line in markdown_content.strip().splitlines() if line.strip()) + normalized_expected = "\n".join(line.strip() for line in expected_markdown_fragment.strip().splitlines() if line.strip()) + self.assertIn(normalized_expected, normalized_markdown) + + @unittest.skipIf(not REPORTLAB_AVAILABLE, "reportlab is not installed, skipping PDF test.") + def test_convert_document_pdf_local_file(self): + """Test converting a local PDF file to Markdown using convert_document.""" + dummy_pdf_path = os.path.join(self.fixtures_dir, "dummy.pdf") + pdf_text_content = "This is a test PDF document for conversion." + + # Create a dummy PDF using reportlab + c = canvas.Canvas(dummy_pdf_path, pagesize=letter) + c.drawString(100, 750, pdf_text_content) + c.save() + + self.assertTrue(os.path.exists(dummy_pdf_path), "Dummy PDF file was not created.") + + markdown_content = convert_document(local_file=dummy_pdf_path) + self.assertIsNotNone(markdown_content, "Markdown conversion returned None.") + + # PyPDF2 adds "## Page X" headers. We need to check if the text content is present. + # The exact output can vary, so we check for the presence of the core text. + self.assertIn(pdf_text_content, markdown_content, + f"Expected text '{pdf_text_content}' not found in converted Markdown:\n{markdown_content}") + + def test_convert_document_no_source(self): + """Test convert_document with no source provided.""" + # Expecting it to print an error and return None + with unittest.mock.patch('builtins.print') as mocked_print: + result = convert_document() + self.assertIsNone(result) + mocked_print.assert_any_call("Error: Please provide either a URL or a local file.") + + def test_convert_document_both_sources(self): + """Test convert_document with both URL and local_file provided.""" + dummy_html_path = os.path.join(self.fixtures_dir, "dummy_both.html") + with open(dummy_html_path, "w") as f: + f.write("
<html><body><p>test</p></body></html>
") + + with unittest.mock.patch('builtins.print') as mocked_print: + result = convert_document(url="http://example.com", local_file=dummy_html_path) + self.assertIsNone(result) + mocked_print.assert_any_call("Error: Please provide either a URL or a local file, not both.") + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_etl_fetch.py b/tests/test_etl_fetch.py new file mode 100644 index 0000000..3e5fc23 --- /dev/null +++ b/tests/test_etl_fetch.py @@ -0,0 +1,29 @@ +import asyncio +import unittest +from unittest.mock import AsyncMock, patch + +# Adjust the import path based on your project structure if necessary +# This assumes that the 'defs' directory is in the PYTHONPATH +# or that tests are run from the root directory. +from defs.etl_fetch import fetch_resources + +class TestEtlFetch(unittest.IsolatedAsyncioTestCase): # Changed to IsolatedAsyncioTestCase + + @patch('defs.etl_fetch.AsyncWebCrawler') + async def test_fetch_resources_uses_source_url(self, MockAsyncWebCrawler): + # Configure the mock + mock_crawler_instance = MockAsyncWebCrawler.return_value + # For async context manager __aenter__ and __aexit__ needs to be AsyncMock + mock_crawler_instance.__aenter__ = AsyncMock(return_value=mock_crawler_instance) + mock_crawler_instance.__aexit__ = AsyncMock(return_value=None) + + # arun should also be an AsyncMock if it's awaited + mock_crawler_instance.arun = AsyncMock(return_value=type('obj', (object,), {'markdown': 'mocked markdown'})()) + + test_url = "http://example.com/test_source" + + await fetch_resources(test_url) + + mock_crawler_instance.arun.assert_called_once_with(url=test_url) + +# Removed the if __name__ == '__main__': block to rely on test discovery
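
Because this patch routes output through Python's `logging` module at INFO level instead of `print()`, a caller sees nothing under the default WARNING root log level. A minimal caller sketch, not part of the patch itself (it assumes execution from the repository root so `defs` is importable, and reuses the URL that the old hardcoded version fetched):

```python
# Sketch only: configure logging so the INFO-level Markdown emitted by
# fetch_resources() is visible, then run the coroutine.
import asyncio
import logging

from defs.etl_fetch import fetch_resources

# The root logger defaults to WARNING, which would hide the fetched Markdown.
logging.basicConfig(level=logging.INFO)

asyncio.run(fetch_resources("https://www.hydrosheds.org/hydrosheds-core-downloads"))
```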