8 changes: 7 additions & 1 deletion .env.example
@@ -1 +1,7 @@
-SCRAPEGRAPH_API_KEY=your-api-key-here
+AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com
+AZURE_OPENAI_API_KEY=your-api-key
+AZURE_OPENAI_DEPLOYMENT=your-deployment-name
+AZURE_OPENAI_API_VERSION=2024-02-15-preview
+AZURE_OPENAI_TEMPERATURE=0.2
+AZURE_OPENAI_THROTTLE_MS=5000
+MAX_MARKDOWN_LENGTH=20000
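
For orientation, the first three variables are composed into the Azure OpenAI chat-completions request URL; a minimal sketch mirroring the URL construction in the `scrape2json.js` diff below (the resource and deployment names are placeholders, not real values):

```js
// Minimal sketch: how the env vars map onto the request URL.
// "your-resource" and "your-deployment-name" are placeholders.
import 'dotenv/config';

const endpoint = process.env.AZURE_OPENAI_ENDPOINT;     // e.g. https://your-resource.openai.azure.com
const deployment = process.env.AZURE_OPENAI_DEPLOYMENT; // e.g. your-deployment-name
const apiVersion = process.env.AZURE_OPENAI_API_VERSION || '2024-02-15-preview';

const url = `${endpoint}/openai/deployments/${deployment}/chat/completions?api-version=${apiVersion}`;
console.log(url);
// https://your-resource.openai.azure.com/openai/deployments/your-deployment-name/chat/completions?api-version=2024-02-15-preview
```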
33 changes: 22 additions & 11 deletions README.md
@@ -1,6 +1,6 @@
# scrape2json

-Extract structured data from PDFs and web pages using AI.
+Extract structured data from PDFs and web pages using Azure OpenAI.

## Installation

@@ -10,37 +10,48 @@ pnpm install

## Configuration

-Copy `.env.example` to `.env` and add your ScrapeGraph API key:
+Copy `.env.example` to `.env` and configure your Azure OpenAI settings:

```bash
cp .env.example .env
```

-Then edit `.env`:
+Required environment variables:

```
-SCRAPEGRAPH_API_KEY=your-api-key-here
+AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com
+AZURE_OPENAI_API_KEY=your-api-key
+AZURE_OPENAI_DEPLOYMENT=your-deployment-name
+AZURE_OPENAI_API_VERSION=2024-02-15-preview  # optional
```

## Usage

```bash
-node scrape2json.js <url>
+node scrape2json.js <url> [url2] [url3] ...
```

+Accepts multiple URLs (a mix of PDFs and web pages). Each is processed in turn, then a meta summary is generated.

### Examples

```bash
-# Extract from PDF
+# Single document
node scrape2json.js https://dserver.bundestag.de/brd/2025/0204-25.pdf

-# Extract from webpage
-node scrape2json.js https://www.bundestag.de/dokumente/textarchiv/2025/kw42-de-rente-1115416
+# Multiple documents
+node scrape2json.js \
+  https://dserver.bundestag.de/brd/2025/0204-25.pdf \
+  https://www.bundestag.de/dokumente/textarchiv/2025/kw42-de-rente-1115416
```

## Output

Returns JSON with:
-- `date` — relevant implementation date
-- `title` — document title
-- `summary` — one paragraph summary in English
+- `title` — generated title for the collection
+- `summary` — meta-summary synthesizing all documents
+- `items` — array of documents sorted by date, each with:
+  - `date` — relevant implementation date
+  - `title` — document title
+  - `summary` — one paragraph summary
+  - `url` — source URL
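
For illustration, a run over two documents might produce JSON of this shape (all field values below are invented placeholders, not real output):

```json
{
  "title": "German pension reform documents",
  "summary": "Short meta-summary synthesizing the key themes across all documents.",
  "items": [
    {
      "date": "2025-10-17",
      "title": "Bundesrat Drucksache 204/25",
      "summary": "One-paragraph summary of the first document.",
      "url": "https://dserver.bundestag.de/brd/2025/0204-25.pdf"
    },
    {
      "date": "2026-01-01",
      "title": "Bundestag debate on pensions",
      "summary": "One-paragraph summary of the second document.",
      "url": "https://www.bundestag.de/dokumente/textarchiv/2025/kw42-de-rente-1115416"
    }
  ]
}
```

Per the code below, documents that fail to process still appear in `items`, with `null` fields and an added `error` message.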
225 changes: 181 additions & 44 deletions scrape2json.js
@@ -1,36 +1,109 @@
import 'dotenv/config';
-import { smartScraper } from 'scrapegraph-js';
import pdf2md from '@opendocsg/pdf2md';
import TurndownService from 'turndown';

-const apiKey = process.env.SCRAPEGRAPH_API_KEY;
-if (!apiKey) {
-  console.error('Error: SCRAPEGRAPH_API_KEY environment variable is required');
-  console.error('Copy .env.example to .env and add your API key');
+// Azure OpenAI configuration
+const endpoint = process.env.AZURE_OPENAI_ENDPOINT;
+const apiKey = process.env.AZURE_OPENAI_API_KEY;
+const deployment = process.env.AZURE_OPENAI_DEPLOYMENT;
+const apiVersion = process.env.AZURE_OPENAI_API_VERSION || '2024-02-15-preview';
+const temperature = parseFloat(process.env.AZURE_OPENAI_TEMPERATURE) || 0.2;
+const throttleMs = parseInt(process.env.AZURE_OPENAI_THROTTLE_MS) || 5000; // 5 seconds default
+const maxMarkdownLength = parseInt(process.env.MAX_MARKDOWN_LENGTH) || 50000; // ~50k chars default
+
+if (!endpoint || !apiKey || !deployment) {
+  console.error('Error: Azure OpenAI environment variables are required');
+  console.error('Required: AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, AZURE_OPENAI_DEPLOYMENT');
+  console.error('Optional: AZURE_OPENAI_API_VERSION (default: 2024-02-15-preview)');
+  console.error('Optional: AZURE_OPENAI_TEMPERATURE (default: 0.2)');
+  console.error('Optional: AZURE_OPENAI_THROTTLE_MS (default: 5000)');
+  console.error('Optional: MAX_MARKDOWN_LENGTH (default: 50000)');
+  console.error('Copy .env.example to .env and configure your Azure OpenAI settings');
  process.exit(1);
}

-// Get URL from command line argument
-const inputUrl = process.argv[2];
-if (!inputUrl) {
-  console.error('Usage: node scrape2json.js <url>');
-  console.error('Example: node scrape2json.js https://example.com/document.pdf');
-  console.error('         node scrape2json.js https://example.com/page.html');
+/**
+ * Sleep for specified milliseconds
+ */
+function sleep(ms) {
+  return new Promise(resolve => setTimeout(resolve, ms));
+}
+
+// Get URLs from command line arguments
+const inputUrls = process.argv.slice(2);
+if (inputUrls.length === 0) {
+  console.error('Usage: node scrape2json.js <url> [url2] [url3] ...');
+  console.error('Example: node scrape2json.js https://example.com/doc.pdf https://example.com/page.html');
  process.exit(1);
}

-const prompt = `Extract the following fields:
+const itemSystemPrompt = `You extract structured data from documents. Be extremely concise. Return valid JSON only.`;
+
+const itemUserPrompt = `Extract from this document:
+
+- title: Document title
+- date: Implementation date (YYYY-MM-DD)
+- summary: 2-3 sentences maximum, key points only
+
+Return JSON with: date, title, summary
+
+Document:
+`;
+
+const metaSystemPrompt = `You synthesize document collections. Be extremely concise. Return valid JSON only.`;
+
+const metaUserPrompt = `From these summaries, generate:
+
+- title: Brief collection title
+- summary: 2-3 sentences synthesizing key themes
+
+Return JSON with: title, summary
+
-- title
-- date (relevant to the implementation)
+Summaries:
+`;

-Generate a summary of no more than 1 paragraph.
+/**
+ * Call Azure OpenAI chat completion API
+ */
+async function callAzureOpenAI(systemPrompt, userContent) {
+  const url = `${endpoint}/openai/deployments/${deployment}/chat/completions?api-version=${apiVersion}`;
+
+  const response = await fetch(url, {
+    method: 'POST',
+    headers: {
+      'api-key': apiKey,
+      'Content-Type': 'application/json'
+    },
+    body: JSON.stringify({
+      messages: [
+        { role: 'system', content: systemPrompt },
+        { role: 'user', content: userContent }
+      ],
+      response_format: { type: 'json_object' },
+      temperature
+    })
+  });

-Return JSON with following fields:
+  if (!response.ok) {
+    const error = await response.text();
+    throw new Error(`Azure OpenAI API error: ${response.status} - ${error}`);
+  }

-- date
-- title
-- summary`;
+  const data = await response.json();
+  const content = data.choices?.[0]?.message?.content;
+
+  if (!content) {
+    throw new Error('No content in Azure OpenAI response');
+  }
+
+  // Throttle to avoid rate limits
+  if (throttleMs > 0) {
+    console.log(`  Throttling ${throttleMs}ms...`);
+    await sleep(throttleMs);
+  }
+
+  return JSON.parse(content);
+}

/**
 * Detect if a URL points to a PDF based on URL extension or content-type
@@ -49,7 +49,6 @@ function isPdfUrl(url, contentType) {
 * Convert PDF buffer to Markdown
 */
async function pdfToMarkdown(buffer) {
-  console.log('Converting PDF to Markdown...');
  const markdown = await pdf2md(buffer);
  return markdown;
}
@@ -58,7 +58,6 @@ async function pdfToMarkdown(buffer) {
 * Convert HTML to Markdown using Turndown
 */
function htmlToMarkdown(html) {
-  console.log('Converting HTML to Markdown...');
  const turndown = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced'
@@ -79,47 +79,113 @@ async function urlToMarkdown(url) {
  const contentType = response.headers.get('content-type') || '';

  if (isPdfUrl(url, contentType)) {
-    // Handle PDF
+    console.log('  Converting PDF to Markdown...');
    const arrayBuffer = await response.arrayBuffer();
    const pdfBuffer = Buffer.from(arrayBuffer);
    return await pdfToMarkdown(pdfBuffer);
  } else {
-    // Handle HTML
+    console.log('  Converting HTML to Markdown...');
    const html = await response.text();
    return htmlToMarkdown(html);
  }
}

/**
- * Main function: fetch URL, convert to markdown, extract data with smartScraper
+ * Extract data from markdown using Azure OpenAI
 */
+async function extractFromMarkdown(markdown, systemPrompt, userPromptPrefix) {
+  // Truncate markdown if too long (leave room for prompts)
+  const truncatedMarkdown = markdown.length > maxMarkdownLength
+    ? markdown.substring(0, maxMarkdownLength) + '\n\n[Content truncated...]'
+    : markdown;
+
+  if (markdown.length > maxMarkdownLength) {
+    console.log(`  Truncated from ${markdown.length} to ${maxMarkdownLength} chars`);
+  }
+
+  return await callAzureOpenAI(systemPrompt, userPromptPrefix + truncatedMarkdown);
+}
+
+/**
+ * Process a single URL: fetch, convert to markdown, extract data
+ */
async function scrapeUrlToJson(url) {
  const markdown = await urlToMarkdown(url);
-  console.log(`Converted to ${markdown.length} characters of Markdown`);
+  console.log(`  Converted to ${markdown.length} characters of Markdown`);

+  console.log('  Extracting data...');
+  const result = await extractFromMarkdown(markdown, itemSystemPrompt, itemUserPrompt);
+
+  return { ...result, url };
}

+/**
+ * Parse date string to Date object for sorting
+ */
+function parseDate(dateStr) {
+  if (!dateStr) return new Date(0);
+  const parsed = new Date(dateStr);
+  return isNaN(parsed.getTime()) ? new Date(0) : parsed;
+}

+/**
+ * Generate meta summary from all items
+ */
+async function generateMetaSummary(items) {
+  const summariesMarkdown = items
+    .map((item, i) => `## Document ${i + 1}: ${item.title || 'Untitled'}\n\nDate: ${item.date || 'Unknown'}\n\n${item.summary || 'No summary'}`)
+    .join('\n\n---\n\n');
+
+  console.log('\nGenerating meta summary...');
+  console.log('Summaries being sent:\n' + summariesMarkdown.substring(0, 500) + '...\n');
+
+  return await extractFromMarkdown(summariesMarkdown, metaSystemPrompt, metaUserPrompt);
+}

+/**
+ * Main function: process multiple URLs and generate combined output
+ */
+async function scrapeMultipleUrls(urls) {
+  // Process each URL sequentially to avoid rate limits
+  const items = [];
+  for (let i = 0; i < urls.length; i++) {
+    console.log(`\n[${i + 1}/${urls.length}] Processing: ${urls[i]}`);
+    try {
+      const result = await scrapeUrlToJson(urls[i]);
+      items.push(result);
+    } catch (error) {
+      console.error(`  Error processing ${urls[i]}: ${error.message}`);
+      items.push({
+        url: urls[i],
+        title: null,
+        date: null,
+        summary: null,
+        error: error.message
+      });
+    }
+  }

+  // Sort items by date
+  const sortedItems = items.sort((a, b) => {
+    return parseDate(a.date).getTime() - parseDate(b.date).getTime();
+  });

-  // Pass markdown to smartScraper
-  console.log('Extracting data with smartScraper...');
-  const result = await smartScraper(
-    apiKey,
-    null, // url (null since we're using websiteMarkdown)
-    prompt,
-    null, // schema
-    null, // numberOfScrolls
-    null, // totalPages
-    null, // cookies
-    {}, // options
-    false, // plain_text
-    false, // renderHeavyJs
-    false, // stealth
-    null, // websiteHtml
-    markdown // websiteMarkdown
-  );
+  // Generate meta summary
+  const validItems = sortedItems.filter(item => !item.error);
+  const meta = validItems.length > 0
+    ? await generateMetaSummary(validItems)
+    : { title: 'No valid documents', summary: 'All documents failed to process.' };

-  return result;
+  return {
+    title: meta.title,
+    summary: meta.summary,
+    items: sortedItems
+  };
}

try {
-  const result = await scrapeUrlToJson(inputUrl);
+  const result = await scrapeMultipleUrls(inputUrls);
  console.log('\n' + '='.repeat(60));
  console.log('Result:', JSON.stringify(result, null, 2));
} catch (error) {
  console.error('Error:', error);