From e6b446a08e1c66c12c3103f359326863bdfa8820 Mon Sep 17 00:00:00 2001
From: Jonathan Moore
Date: Tue, 2 Dec 2025 15:52:20 +0100
Subject: [PATCH 1/3] Support multiple URLs and an overall summary

---
 README.md      |  22 +++++---
 scrape2json.js | 145 ++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 139 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 02950e0..f26f983 100644
--- a/README.md
+++ b/README.md
@@ -25,22 +25,30 @@ SCRAPEGRAPH_API_KEY=your-api-key-here
 ## Usage
 
 ```bash
-node scrape2json.js <url>
+node scrape2json.js <url1> [url2] [url3] ...
 ```
 
+Accepts one or more URLs (a mix of PDFs and web pages). Each document is processed in turn, then a meta-summary is generated across all of them.
+
 ### Examples
 
 ```bash
-# Extract from PDF
+# Single document
 node scrape2json.js https://dserver.bundestag.de/brd/2025/0204-25.pdf
 
-# Extract from webpage
-node scrape2json.js https://www.bundestag.de/dokumente/textarchiv/2025/kw42-de-rente-1115416
+# Multiple documents
+node scrape2json.js \
+  https://dserver.bundestag.de/brd/2025/0204-25.pdf \
+  https://www.bundestag.de/dokumente/textarchiv/2025/kw42-de-rente-1115416
 ```
 
 ## Output
 
 Returns JSON with:
-- `date` — relevant implementation date
-- `title` — document title
-- `summary` — one paragraph summary in English
+- `title` — generated title for the collection
+- `summary` — meta-summary synthesizing all documents
+- `items` — array of documents sorted by date, each with:
+  - `date` — relevant implementation date
+  - `title` — document title
+  - `summary` — one-paragraph summary
+  - `url` — source URL
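For illustration, output in the shape the README now describes might look like this (titles, dates, and summaries are hypothetical; only the URLs come from the examples above):

```json
{
  "title": "German pension legislation, autumn 2025",
  "summary": "A meta-summary synthesizing the key themes across both documents...",
  "items": [
    {
      "date": "2025-11-21",
      "title": "Bundesrat Drucksache 204/25",
      "summary": "One-paragraph summary of the first document...",
      "url": "https://dserver.bundestag.de/brd/2025/0204-25.pdf"
    },
    {
      "date": "2026-01-01",
      "title": "Bundestag debate on pension reform",
      "summary": "One-paragraph summary of the second document...",
      "url": "https://www.bundestag.de/dokumente/textarchiv/2025/kw42-de-rente-1115416"
    }
  ]
}
```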
diff --git a/scrape2json.js b/scrape2json.js
index 8794c40..4691acc 100644
--- a/scrape2json.js
+++ b/scrape2json.js
@@ -10,16 +10,15 @@ if (!apiKey) {
   process.exit(1);
 }
 
-// Get URL from command line argument
-const inputUrl = process.argv[2];
-if (!inputUrl) {
-  console.error('Usage: node scrape2json.js <url>');
-  console.error('Example: node scrape2json.js https://example.com/document.pdf');
-  console.error('         node scrape2json.js https://example.com/page.html');
+// Get URLs from command line arguments
+const inputUrls = process.argv.slice(2);
+if (inputUrls.length === 0) {
+  console.error('Usage: node scrape2json.js <url1> [url2] [url3] ...');
+  console.error('Example: node scrape2json.js https://example.com/doc.pdf https://example.com/page.html');
   process.exit(1);
 }
 
-const prompt = `Extract the following fields:
+const itemPrompt = `Extract the following fields:
 
 - title
 - date (relevant to the implementation)
@@ -32,6 +31,16 @@ Return JSON with the following fields:
 - title
 - summary`;
 
+const metaPrompt = `Given these document summaries, generate:
+
+1. A concise title that describes the overall collection
+2. A meta-summary (1 paragraph) synthesizing the key themes across all documents
+
+Return JSON with the following fields:
+
+- title
+- summary`;
+
 /**
  * Detect if a URL points to a PDF based on URL extension or content-type
  */
@@ -49,7 +58,6 @@ function isPdfUrl(url, contentType) {
  * Convert PDF buffer to Markdown
  */
 async function pdfToMarkdown(buffer) {
-  console.log('Converting PDF to Markdown...');
   const markdown = await pdf2md(buffer);
   return markdown;
 }
@@ -58,7 +66,6 @@ async function pdfToMarkdown(buffer) {
  * Convert HTML to Markdown using Turndown
  */
 function htmlToMarkdown(html) {
-  console.log('Converting HTML to Markdown...');
   const turndown = new TurndownService({
     headingStyle: 'atx',
     codeBlockStyle: 'fenced'
@@ -79,27 +86,22 @@ async function urlToMarkdown(url) {
   const contentType = response.headers.get('content-type') || '';
 
   if (isPdfUrl(url, contentType)) {
-    // Handle PDF
+    console.log('  Converting PDF to Markdown...');
     const arrayBuffer = await response.arrayBuffer();
     const pdfBuffer = Buffer.from(arrayBuffer);
     return await pdfToMarkdown(pdfBuffer);
   } else {
-    // Handle HTML
+    console.log('  Converting HTML to Markdown...');
     const html = await response.text();
     return htmlToMarkdown(html);
   }
 }
 
 /**
- * Main function: fetch URL, convert to markdown, extract data with smartScraper
+ * Extract data from markdown using smartScraper
  */
-async function scrapeUrlToJson(url) {
-  const markdown = await urlToMarkdown(url);
-  console.log(`Converted to ${markdown.length} characters of Markdown`);
-
-  // Pass markdown to smartScraper
-  console.log('Extracting data with smartScraper...');
-  const result = await smartScraper(
+async function extractWithSmartScraper(markdown, prompt) {
+  return await smartScraper(
     apiKey,
     null, // url (null since we're using websiteMarkdown)
     prompt,
@@ -114,12 +116,113 @@
     null, // websiteHtml
     markdown // websiteMarkdown
   );
+}
+
+/**
+ * Process a single URL: fetch, convert to markdown, extract data
+ */
+async function scrapeUrlToJson(url) {
+  const markdown = await urlToMarkdown(url);
+  console.log(`  Converted to ${markdown.length} characters of Markdown`);
 
-  return result;
+  console.log('  Extracting data...');
+  const result = await extractWithSmartScraper(markdown, itemPrompt);
+
+  return { ...result, url };
+}
+
+/**
+ * Parse date string to Date object for sorting
+ */
+function parseDate(dateStr) {
+  if (!dateStr) return new Date(0);
+  const parsed = new Date(dateStr);
+  return isNaN(parsed.getTime()) ? new Date(0) : parsed;
+}
+
+/**
+ * Extract the actual result data from API response
+ */
+function unwrapResult(apiResponse) {
+  // smartScraper returns { result: { ... }, request_id: ... }
+  return apiResponse?.result || apiResponse;
+}
+
+/**
+ * Generate meta summary from all items
+ */
+async function generateMetaSummary(items) {
+  const summariesMarkdown = items
+    .map((item, i) => {
+      const data = unwrapResult(item);
+      return `## Document ${i + 1}: ${data.title || 'Untitled'}\n\nDate: ${data.date || 'Unknown'}\n\n${data.summary || 'No summary'}`;
+    })
+    .join('\n\n---\n\n');
+
+  console.log('\nGenerating meta summary...');
+  console.log('Summaries being sent:\n' + summariesMarkdown.substring(0, 500) + '...\n');
+  return await extractWithSmartScraper(summariesMarkdown, metaPrompt);
+}
+
+/**
+ * Main function: process multiple URLs and generate combined output
+ */
+async function scrapeMultipleUrls(urls) {
+  // Process each URL sequentially to avoid rate limits
+  const items = [];
+  for (let i = 0; i < urls.length; i++) {
+    console.log(`\n[${i + 1}/${urls.length}] Processing: ${urls[i]}`);
+    try {
+      const result = await scrapeUrlToJson(urls[i]);
+      items.push(result);
+    } catch (error) {
+      console.error(`  Error processing ${urls[i]}: ${error.message}`);
+      items.push({
+        url: urls[i],
+        title: null,
+        date: null,
+        summary: null,
+        error: error.message
+      });
+    }
+  }
+
+  // Sort items by date (unwrap result to access date field)
+  const sortedItems = items.sort((a, b) => {
+    const dateA = unwrapResult(a).date;
+    const dateB = unwrapResult(b).date;
+    return parseDate(dateA).getTime() - parseDate(dateB).getTime();
+  });
+
+  // Generate meta summary
+  const validItems = sortedItems.filter(item => !item.error);
+  const meta = validItems.length > 0
+    ? await generateMetaSummary(validItems)
+    : { title: 'No valid documents', summary: 'All documents failed to process.' };
+
+  // Normalize items to flatten the result structure
+  const normalizedItems = sortedItems.map(item => {
+    if (item.error) return item;
+    const data = unwrapResult(item);
+    return {
+      url: item.url,
+      title: data.title,
+      date: data.date,
+      summary: data.summary
+    };
+  });
+
+  const metaData = unwrapResult(meta);
+  return {
+    title: metaData.title,
+    summary: metaData.summary,
+    items: normalizedItems
+  };
 }
 
 try {
-  const result = await scrapeUrlToJson(inputUrl);
+  const result = await scrapeMultipleUrls(inputUrls);
+  console.log('\n' + '='.repeat(60));
   console.log('Result:', JSON.stringify(result, null, 2));
 } catch (error) {
   console.error('Error:', error);
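With this patch applied, a two-URL run prints progress along these lines before the final JSON (character counts are hypothetical; each line below comes from a `console.log` in the code above, though unchanged logging in `urlToMarkdown` may add more):

```
[1/2] Processing: https://dserver.bundestag.de/brd/2025/0204-25.pdf
  Converting PDF to Markdown...
  Converted to 48213 characters of Markdown
  Extracting data...

[2/2] Processing: https://www.bundestag.de/dokumente/textarchiv/2025/kw42-de-rente-1115416
  Converting HTML to Markdown...
  Converted to 9120 characters of Markdown
  Extracting data...

Generating meta summary...
Summaries being sent:
## Document 1: ...

============================================================
Result: { ... }
```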
From 2ca00b7920b38466fd458418c6412a4707deac21 Mon Sep 17 00:00:00 2001
From: Jonathan Moore
Date: Tue, 2 Dec 2025 16:22:28 +0100
Subject: [PATCH 2/3] Convert to Azure OpenAI

---
 .env.example   |   5 +-
 README.md      |  11 ++--
 scrape2json.js | 148 ++++++++++++++++++++++++++++++---------------------
 3 files changed, 91 insertions(+), 73 deletions(-)

diff --git a/.env.example b/.env.example
index 9dab986..07058de 100644
--- a/.env.example
+++ b/.env.example
@@ -1 +1,4 @@
-SCRAPEGRAPH_API_KEY=your-api-key-here
+AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com
+AZURE_OPENAI_API_KEY=your-api-key
+AZURE_OPENAI_DEPLOYMENT=your-deployment-name
+AZURE_OPENAI_API_VERSION=2024-02-15-preview

diff --git a/README.md b/README.md
index f26f983..c5734bd 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # scrape2json
 
-Extract structured data from PDFs and web pages using AI.
+Extract structured data from PDFs and web pages using Azure OpenAI.
 
 ## Installation
 
@@ -10,16 +10,19 @@ pnpm install
 
 ## Configuration
 
-Copy `.env.example` to `.env` and add your ScrapeGraph API key:
+Copy `.env.example` to `.env` and configure your Azure OpenAI settings:
 
 ```bash
 cp .env.example .env
 ```
 
-Then edit `.env`:
+Environment variables:
 
 ```
-SCRAPEGRAPH_API_KEY=your-api-key-here
+AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com
+AZURE_OPENAI_API_KEY=your-api-key
+AZURE_OPENAI_DEPLOYMENT=your-deployment-name
+AZURE_OPENAI_API_VERSION=2024-02-15-preview # optional
 ```
 
 ## Usage
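The Configuration section above maps straight onto a filled-in `.env`. For example (resource name, key, and deployment name are placeholders, not real values):

```
AZURE_OPENAI_ENDPOINT=https://contoso-openai.openai.azure.com
AZURE_OPENAI_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
AZURE_OPENAI_DEPLOYMENT=gpt-4o-mini
```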
diff --git a/scrape2json.js b/scrape2json.js
index 4691acc..8d7776a 100644
--- a/scrape2json.js
+++ b/scrape2json.js
@@ -1,12 +1,18 @@
 import 'dotenv/config';
-import { smartScraper } from 'scrapegraph-js';
 import pdf2md from '@opendocsg/pdf2md';
 import TurndownService from 'turndown';
 
-const apiKey = process.env.SCRAPEGRAPH_API_KEY;
-if (!apiKey) {
-  console.error('Error: SCRAPEGRAPH_API_KEY environment variable is required');
-  console.error('Copy .env.example to .env and add your API key');
+// Azure OpenAI configuration
+const endpoint = process.env.AZURE_OPENAI_ENDPOINT;
+const apiKey = process.env.AZURE_OPENAI_API_KEY;
+const deployment = process.env.AZURE_OPENAI_DEPLOYMENT;
+const apiVersion = process.env.AZURE_OPENAI_API_VERSION || '2024-02-15-preview';
+
+if (!endpoint || !apiKey || !deployment) {
+  console.error('Error: Azure OpenAI environment variables are required');
+  console.error('Required: AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, AZURE_OPENAI_DEPLOYMENT');
+  console.error('Optional: AZURE_OPENAI_API_VERSION (default: 2024-02-15-preview)');
+  console.error('Copy .env.example to .env and configure your Azure OpenAI settings');
   process.exit(1);
 }
 
@@ -18,28 +24,67 @@ if (inputUrls.length === 0) {
   process.exit(1);
 }
 
-const itemPrompt = `Extract the following fields:
+const itemSystemPrompt = `You are a document analyzer. Extract structured data from the provided document and return valid JSON only.`;
 
-- title
-- date (relevant to the implementation)
+const itemUserPrompt = `Extract the following fields from this document:
 
-Generate a summary of no more than 1 paragraph.
+- title: The document title
+- date: The date relevant to the implementation (in ISO format YYYY-MM-DD if possible)
+- summary: A concise summary of no more than 1 paragraph
 
-Return JSON with the following fields:
+Return ONLY valid JSON with these fields: date, title, summary
 
-- date
-- title
-- summary`;
+Document content:
+`;
 
-const metaPrompt = `Given these document summaries, generate:
+const metaSystemPrompt = `You are a document collection analyzer. Synthesize information from multiple document summaries and return valid JSON only.`;
 
-1. A concise title that describes the overall collection
-2. A meta-summary (1 paragraph) synthesizing the key themes across all documents
+const metaUserPrompt = `Given these document summaries, generate:
 
-Return JSON with the following fields:
+1. A concise title that describes the overall collection
+2. A meta-summary (1 paragraph) synthesizing the key themes across all documents
 
-- title
-- summary`;
+Return ONLY valid JSON with these fields: title, summary
+
+Document summaries:
+`;
+
+/**
+ * Call Azure OpenAI chat completion API
+ */
+async function callAzureOpenAI(systemPrompt, userContent) {
+  const url = `${endpoint}/openai/deployments/${deployment}/chat/completions?api-version=${apiVersion}`;
+
+  const response = await fetch(url, {
+    method: 'POST',
+    headers: {
+      'api-key': apiKey,
+      'Content-Type': 'application/json'
+    },
+    body: JSON.stringify({
+      messages: [
+        { role: 'system', content: systemPrompt },
+        { role: 'user', content: userContent }
+      ],
+      response_format: { type: 'json_object' },
+      temperature: 0.3
+    })
+  });
+
+  if (!response.ok) {
+    const error = await response.text();
+    throw new Error(`Azure OpenAI API error: ${response.status} - ${error}`);
+  }
+
+  const data = await response.json();
+  const content = data.choices?.[0]?.message?.content;
+
+  if (!content) {
+    throw new Error('No content in Azure OpenAI response');
+  }
+
+  return JSON.parse(content);
+}
 
 /**
  * Detect if a URL points to a PDF based on URL extension or content-type
  */
@@ -98,24 +143,16 @@ async function urlToMarkdown(url) {
 }
 
 /**
- * Extract data from markdown using smartScraper
+ * Extract data from markdown using Azure OpenAI
  */
-async function extractWithSmartScraper(markdown, prompt) {
-  return await smartScraper(
-    apiKey,
-    null, // url (null since we're using websiteMarkdown)
-    prompt,
-    null, // schema
-    null, // numberOfScrolls
-    null, // totalPages
-    null, // cookies
-    {}, // options
-    false, // plain_text
-    false, // renderHeavyJs
-    false, // stealth
-    null, // websiteHtml
-    markdown // websiteMarkdown
-  );
+async function extractFromMarkdown(markdown, systemPrompt, userPromptPrefix) {
+  // Truncate markdown if too long (leave room for prompts)
+  const maxLength = 100000;
+  const truncatedMarkdown = markdown.length > maxLength
+    ? markdown.substring(0, maxLength) + '\n\n[Content truncated...]'
+    : markdown;
+
+  return await callAzureOpenAI(systemPrompt, userPromptPrefix + truncatedMarkdown);
 }
 
 /**
  * Process a single URL: fetch, convert to markdown, extract data
  */
 async function scrapeUrlToJson(url) {
   const markdown = await urlToMarkdown(url);
   console.log(`  Converted to ${markdown.length} characters of Markdown`);
 
   console.log('  Extracting data...');
-  const result = await extractWithSmartScraper(markdown, itemPrompt);
+  const result = await extractFromMarkdown(markdown, itemSystemPrompt, itemUserPrompt);
 
   return { ...result, url };
 }
 
 /**
  * Parse date string to Date object for sorting
  */
 function parseDate(dateStr) {
   if (!dateStr) return new Date(0);
   const parsed = new Date(dateStr);
   return isNaN(parsed.getTime()) ? new Date(0) : parsed;
 }
 
-/**
- * Extract the actual result data from API response
- */
-function unwrapResult(apiResponse) {
-  // smartScraper returns { result: { ... }, request_id: ... }
-  return apiResponse?.result || apiResponse;
-}
-
 /**
  * Generate meta summary from all items
  */
 async function generateMetaSummary(items) {
   const summariesMarkdown = items
-    .map((item, i) => {
-      const data = unwrapResult(item);
-      return `## Document ${i + 1}: ${data.title || 'Untitled'}\n\nDate: ${data.date || 'Unknown'}\n\n${data.summary || 'No summary'}`;
-    })
+    .map((item, i) => `## Document ${i + 1}: ${item.title || 'Untitled'}\n\nDate: ${item.date || 'Unknown'}\n\n${item.summary || 'No summary'}`)
     .join('\n\n---\n\n');
 
   console.log('\nGenerating meta summary...');
   console.log('Summaries being sent:\n' + summariesMarkdown.substring(0, 500) + '...\n');
-  return await extractWithSmartScraper(summariesMarkdown, metaPrompt);
+
+  return await extractFromMarkdown(summariesMarkdown, metaSystemPrompt, metaUserPrompt);
 }
 
 /**
  * Main function: process multiple URLs and generate combined output
  */
@@ -187,11 +214,9 @@ async function scrapeMultipleUrls(urls) {
     }
   }
 
-  // Sort items by date (unwrap result to access date field)
+  // Sort items by date
   const sortedItems = items.sort((a, b) => {
-    const dateA = unwrapResult(a).date;
-    const dateB = unwrapResult(b).date;
-    return parseDate(dateA).getTime() - parseDate(dateB).getTime();
+    return parseDate(a.date).getTime() - parseDate(b.date).getTime();
   });
 
   // Generate meta summary
@@ -200,23 +225,10 @@ async function scrapeMultipleUrls(urls) {
   const validItems = sortedItems.filter(item => !item.error);
   const meta = validItems.length > 0
     ? await generateMetaSummary(validItems)
     : { title: 'No valid documents', summary: 'All documents failed to process.' };
 
-  // Normalize items to flatten the result structure
-  const normalizedItems = sortedItems.map(item => {
-    if (item.error) return item;
-    const data = unwrapResult(item);
-    return {
-      url: item.url,
-      title: data.title,
-      date: data.date,
-      summary: data.summary
-    };
-  });
-
-  const metaData = unwrapResult(meta);
   return {
-    title: metaData.title,
-    summary: metaData.summary,
-    items: normalizedItems
+    title: meta.title,
+    summary: meta.summary,
+    items: sortedItems
   };
 }
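The heart of this commit is replacing the `smartScraper` call with a plain `fetch` against the Azure OpenAI chat-completions REST endpoint in JSON mode. A minimal standalone sketch of the same call, assuming the environment variables above are set (the prompt text is illustrative only):

```js
import 'dotenv/config';

const endpoint = process.env.AZURE_OPENAI_ENDPOINT;
const deployment = process.env.AZURE_OPENAI_DEPLOYMENT;
const apiVersion = process.env.AZURE_OPENAI_API_VERSION || '2024-02-15-preview';

// One JSON-mode chat completion, mirroring callAzureOpenAI() above
const response = await fetch(
  `${endpoint}/openai/deployments/${deployment}/chat/completions?api-version=${apiVersion}`,
  {
    method: 'POST',
    headers: {
      'api-key': process.env.AZURE_OPENAI_API_KEY,
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      messages: [
        { role: 'system', content: 'Return valid JSON only.' },
        { role: 'user', content: 'Return JSON with: greeting' }
      ],
      response_format: { type: 'json_object' }
    })
  }
);

if (!response.ok) throw new Error(`Azure OpenAI API error: ${response.status}`);
const data = await response.json();
console.log(JSON.parse(data.choices[0].message.content));
```

Note that JSON mode guarantees syntactically valid JSON, not the requested fields, which is why the prompts spell out the field names explicitly.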
} - return apiResponse?.result || apiResponse; -} - /** * Generate meta summary from all items */ async function generateMetaSummary(items) { const summariesMarkdown = items - .map((item, i) => { - const data = unwrapResult(item); - return `## Document ${i + 1}: ${data.title || 'Untitled'}\n\nDate: ${data.date || 'Unknown'}\n\n${data.summary || 'No summary'}`; - }) + .map((item, i) => `## Document ${i + 1}: ${item.title || 'Untitled'}\n\nDate: ${item.date || 'Unknown'}\n\n${item.summary || 'No summary'}`) .join('\n\n---\n\n'); console.log('\nGenerating meta summary...'); console.log('Summaries being sent:\n' + summariesMarkdown.substring(0, 500) + '...\n'); - return await extractWithSmartScraper(summariesMarkdown, metaPrompt); + + return await extractFromMarkdown(summariesMarkdown, metaSystemPrompt, metaUserPrompt); } /** @@ -187,11 +214,9 @@ async function scrapeMultipleUrls(urls) { } } - // Sort items by date (unwrap result to access date field) + // Sort items by date const sortedItems = items.sort((a, b) => { - const dateA = unwrapResult(a).date; - const dateB = unwrapResult(b).date; - return parseDate(dateA).getTime() - parseDate(dateB).getTime(); + return parseDate(a.date).getTime() - parseDate(b.date).getTime(); }); // Generate meta summary @@ -200,23 +225,10 @@ async function scrapeMultipleUrls(urls) { ? await generateMetaSummary(validItems) : { title: 'No valid documents', summary: 'All documents failed to process.' }; - // Normalize items to flatten the result structure - const normalizedItems = sortedItems.map(item => { - if (item.error) return item; - const data = unwrapResult(item); - return { - url: item.url, - title: data.title, - date: data.date, - summary: data.summary - }; - }); - - const metaData = unwrapResult(meta); return { - title: metaData.title, - summary: metaData.summary, - items: normalizedItems + title: meta.title, + summary: meta.summary, + items: sortedItems }; } From 30bd4b9c68c38e3d75c740498ff71d81a1ae3cf6 Mon Sep 17 00:00:00 2001 From: Jonathan Moore Date: Wed, 3 Dec 2025 11:02:56 +0100 Subject: [PATCH 3/3] configuratble temperature, throttling, and character limit for markdown --- .env.example | 3 +++ scrape2json.js | 56 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/.env.example b/.env.example index 07058de..d362024 100644 --- a/.env.example +++ b/.env.example @@ -2,3 +2,6 @@ AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com AZURE_OPENAI_API_KEY=your-api-key AZURE_OPENAI_DEPLOYMENT=your-deployment-name AZURE_OPENAI_API_VERSION=2024-02-15-preview +AZURE_OPENAI_TEMPERATURE=0.2 +AZURE_OPENAI_THROTTLE_MS=5000 +MAX_MARKDOWN_LENGTH=20000 \ No newline at end of file diff --git a/scrape2json.js b/scrape2json.js index 8d7776a..499cd6e 100644 --- a/scrape2json.js +++ b/scrape2json.js @@ -7,15 +7,28 @@ const endpoint = process.env.AZURE_OPENAI_ENDPOINT; const apiKey = process.env.AZURE_OPENAI_API_KEY; const deployment = process.env.AZURE_OPENAI_DEPLOYMENT; const apiVersion = process.env.AZURE_OPENAI_API_VERSION || '2024-02-15-preview'; +const temperature = parseFloat(process.env.AZURE_OPENAI_TEMPERATURE) || 0.2; +const throttleMs = parseInt(process.env.AZURE_OPENAI_THROTTLE_MS) || 5000; // 5 seconds default +const maxMarkdownLength = parseInt(process.env.MAX_MARKDOWN_LENGTH) || 50000; // ~50k chars default if (!endpoint || !apiKey || !deployment) { console.error('Error: Azure OpenAI environment variables are required'); console.error('Required: AZURE_OPENAI_ENDPOINT, 
@@ -24,29 +37,29 @@ if (inputUrls.length === 0) {
   process.exit(1);
 }
 
-const itemSystemPrompt = `You are a document analyzer. Extract structured data from the provided document and return valid JSON only.`;
+const itemSystemPrompt = `You extract structured data from documents. Be extremely concise. Return valid JSON only.`;
 
-const itemUserPrompt = `Extract the following fields from this document:
+const itemUserPrompt = `Extract from this document:
 
-- title: The document title
-- date: The date relevant to the implementation (in ISO format YYYY-MM-DD if possible)
-- summary: A concise summary of no more than 1 paragraph
+- title: Document title
+- date: Implementation date (YYYY-MM-DD)
+- summary: 2-3 sentences maximum, key points only
 
-Return ONLY valid JSON with these fields: date, title, summary
+Return JSON with: date, title, summary
 
-Document content:
+Document:
 `;
 
-const metaSystemPrompt = `You are a document collection analyzer. Synthesize information from multiple document summaries and return valid JSON only.`;
+const metaSystemPrompt = `You synthesize document collections. Be extremely concise. Return valid JSON only.`;
 
-const metaUserPrompt = `Given these document summaries, generate:
+const metaUserPrompt = `From these summaries, generate:
 
-1. A concise title that describes the overall collection
-2. A meta-summary (1 paragraph) synthesizing the key themes across all documents
+- title: Brief collection title
+- summary: 2-3 sentences synthesizing key themes
 
-Return ONLY valid JSON with these fields: title, summary
+Return JSON with: title, summary
 
-Document summaries:
+Summaries:
 `;
 
 /**
  * Call Azure OpenAI chat completion API
  */
@@ -67,7 +80,7 @@ async function callAzureOpenAI(systemPrompt, userContent) {
         { role: 'user', content: userContent }
       ],
       response_format: { type: 'json_object' },
-      temperature: 0.3
+      temperature
     })
   });
 
   if (!response.ok) {
     const error = await response.text();
     throw new Error(`Azure OpenAI API error: ${response.status} - ${error}`);
   }
 
   const data = await response.json();
   const content = data.choices?.[0]?.message?.content;
 
   if (!content) {
     throw new Error('No content in Azure OpenAI response');
   }
 
+  // Throttle to avoid rate limits
+  if (throttleMs > 0) {
+    console.log(`  Throttling ${throttleMs}ms...`);
+    await sleep(throttleMs);
+  }
+
   return JSON.parse(content);
 }
 
@@ -147,11 +166,14 @@ async function urlToMarkdown(url) {
  */
 async function extractFromMarkdown(markdown, systemPrompt, userPromptPrefix) {
   // Truncate markdown if too long (leave room for prompts)
-  const maxLength = 100000;
-  const truncatedMarkdown = markdown.length > maxLength
-    ? markdown.substring(0, maxLength) + '\n\n[Content truncated...]'
+  const truncatedMarkdown = markdown.length > maxMarkdownLength
+    ? markdown.substring(0, maxMarkdownLength) + '\n\n[Content truncated...]'
     : markdown;
 
+  if (markdown.length > maxMarkdownLength) {
+    console.log(`  Truncated from ${markdown.length} to ${maxMarkdownLength} chars`);
+  }
+
   return await callAzureOpenAI(systemPrompt, userPromptPrefix + truncatedMarkdown);
 }
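Taken together, the three new knobs are tuned per run through the environment. For example, a `.env` like this (values arbitrary):

```
AZURE_OPENAI_TEMPERATURE=0.1
AZURE_OPENAI_THROTTLE_MS=2000
MAX_MARKDOWN_LENGTH=20000
```

would cap each request at 20,000 characters of Markdown (logging `Truncated from ... to 20000 chars` when a document exceeds it) and pause 2 seconds after every completion (logging `Throttling 2000ms...`).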