diff --git a/.env.example b/.env.example
index 9dab986..d362024 100644
--- a/.env.example
+++ b/.env.example
@@ -1 +1,7 @@
-SCRAPEGRAPH_API_KEY=your-api-key-here
+AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com
+AZURE_OPENAI_API_KEY=your-api-key
+AZURE_OPENAI_DEPLOYMENT=your-deployment-name
+AZURE_OPENAI_API_VERSION=2024-02-15-preview
+AZURE_OPENAI_TEMPERATURE=0.2
+AZURE_OPENAI_THROTTLE_MS=5000
+MAX_MARKDOWN_LENGTH=20000
\ No newline at end of file
diff --git a/README.md b/README.md
index 02950e0..c5734bd 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # scrape2json
 
-Extract structured data from PDFs and web pages using AI.
+Extract structured data from PDFs and web pages using Azure OpenAI.
 
 ## Installation
 
@@ -10,37 +10,48 @@ pnpm install
 ## Configuration
 
-Copy `.env.example` to `.env` and add your ScrapeGraph API key:
+Copy `.env.example` to `.env` and configure your Azure OpenAI settings:
 
 ```bash
 cp .env.example .env
 ```
 
-Then edit `.env`:
+Required environment variables:
 
 ```
-SCRAPEGRAPH_API_KEY=your-api-key-here
+AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com
+AZURE_OPENAI_API_KEY=your-api-key
+AZURE_OPENAI_DEPLOYMENT=your-deployment-name
+AZURE_OPENAI_API_VERSION=2024-02-15-preview  # optional
 ```
 
 ## Usage
 
 ```bash
-node scrape2json.js <url>
+node scrape2json.js <url1> [url2] [url3] ...
 ```
 
+Accepts multiple URLs (mix of PDFs and web pages). Processes each, then generates a meta summary.
+
 ### Examples
 
 ```bash
-# Extract from PDF
+# Single document
 node scrape2json.js https://dserver.bundestag.de/brd/2025/0204-25.pdf
 
-# Extract from webpage
-node scrape2json.js https://www.bundestag.de/dokumente/textarchiv/2025/kw42-de-rente-1115416
+# Multiple documents
+node scrape2json.js \
+  https://dserver.bundestag.de/brd/2025/0204-25.pdf \
+  https://www.bundestag.de/dokumente/textarchiv/2025/kw42-de-rente-1115416
 ```
 
 ## Output
 
 Returns JSON with:
 
-- `date` — relevant implementation date
-- `title` — document title
-- `summary` — one paragraph summary in English
+- `title` — generated title for the collection
+- `summary` — meta-summary synthesizing all documents
+- `items` — array of documents sorted by date, each with:
+  - `date` — relevant implementation date
+  - `title` — document title
+  - `summary` — brief summary (2-3 sentences)
+  - `url` — source URL
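For orientation, the output described in the README takes roughly this shape for a two-document run (all values fabricated for illustration; a document that fails to process keeps its `url` and carries an `error` field instead, as implemented in `scrape2json.js` below):

```json
{
  "title": "Pension legislation, 2025",
  "summary": "Both documents concern the 2025 pension reform: a draft regulation adjusting pension values and the parliamentary debate on the wider package.",
  "items": [
    {
      "date": "2025-03-01",
      "title": "Draft regulation on pension adjustment",
      "summary": "Adjusts pension values effective 2025-03-01. Subject to Bundesrat approval.",
      "url": "https://example.com/doc1.pdf"
    },
    {
      "date": "2025-10-15",
      "title": "Debate on the pension package",
      "summary": "First reading of the package. Referred to committee.",
      "url": "https://example.com/page.html"
    }
  ]
}
```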
diff --git a/scrape2json.js b/scrape2json.js
index 8794c40..499cd6e 100644
--- a/scrape2json.js
+++ b/scrape2json.js
@@ -1,36 +1,109 @@
 import 'dotenv/config';
-import { smartScraper } from 'scrapegraph-js';
 import pdf2md from '@opendocsg/pdf2md';
 import TurndownService from 'turndown';
 
-const apiKey = process.env.SCRAPEGRAPH_API_KEY;
-if (!apiKey) {
-  console.error('Error: SCRAPEGRAPH_API_KEY environment variable is required');
-  console.error('Copy .env.example to .env and add your API key');
+// Azure OpenAI configuration
+const endpoint = process.env.AZURE_OPENAI_ENDPOINT;
+const apiKey = process.env.AZURE_OPENAI_API_KEY;
+const deployment = process.env.AZURE_OPENAI_DEPLOYMENT;
+const apiVersion = process.env.AZURE_OPENAI_API_VERSION || '2024-02-15-preview';
+const temperature = parseFloat(process.env.AZURE_OPENAI_TEMPERATURE ?? '0.2');
+const throttleMs = parseInt(process.env.AZURE_OPENAI_THROTTLE_MS ?? '5000', 10); // 5 seconds default
+const maxMarkdownLength = parseInt(process.env.MAX_MARKDOWN_LENGTH ?? '50000', 10); // ~50k chars default
+
+if (!endpoint || !apiKey || !deployment) {
+  console.error('Error: Azure OpenAI environment variables are required');
+  console.error('Required: AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY, AZURE_OPENAI_DEPLOYMENT');
+  console.error('Optional: AZURE_OPENAI_API_VERSION (default: 2024-02-15-preview)');
+  console.error('Optional: AZURE_OPENAI_TEMPERATURE (default: 0.2)');
+  console.error('Optional: AZURE_OPENAI_THROTTLE_MS (default: 5000)');
+  console.error('Optional: MAX_MARKDOWN_LENGTH (default: 50000)');
+  console.error('Copy .env.example to .env and configure your Azure OpenAI settings');
   process.exit(1);
 }
 
-// Get URL from command line argument
-const inputUrl = process.argv[2];
-if (!inputUrl) {
-  console.error('Usage: node scrape2json.js <url>');
-  console.error('Example: node scrape2json.js https://example.com/document.pdf');
-  console.error('         node scrape2json.js https://example.com/page.html');
+/**
+ * Sleep for specified milliseconds
+ */
+function sleep(ms) {
+  return new Promise(resolve => setTimeout(resolve, ms));
+}
+
+// Get URLs from command line arguments
+const inputUrls = process.argv.slice(2);
+if (inputUrls.length === 0) {
+  console.error('Usage: node scrape2json.js <url1> [url2] [url3] ...');
+  console.error('Example: node scrape2json.js https://example.com/doc.pdf https://example.com/page.html');
   process.exit(1);
 }
 
-const prompt = `Extract the following fields:
-
-- title
-- date (relevant to the implementation)
-
-Generate a summary of no more than 1 paragraph.
-
-Return JSON with following fields:
-
-- date
-- title
-- summary`;
+const itemSystemPrompt = `You extract structured data from documents. Be extremely concise. Return valid JSON only.`;
+
+const itemUserPrompt = `Extract from this document:
+
+- title: Document title
+- date: Implementation date (YYYY-MM-DD)
+- summary: 2-3 sentences maximum, key points only
+
+Return JSON with: date, title, summary
+
+Document:
+`;
+
+const metaSystemPrompt = `You synthesize document collections. Be extremely concise. Return valid JSON only.`;
+
+const metaUserPrompt = `From these summaries, generate:
+
+- title: Brief collection title
+- summary: 2-3 sentences synthesizing key themes
+
+Return JSON with: title, summary
+
+Summaries:
+`;
+
+/**
+ * Call the Azure OpenAI chat completions API
+ */
+async function callAzureOpenAI(systemPrompt, userContent) {
+  const url = `${endpoint}/openai/deployments/${deployment}/chat/completions?api-version=${apiVersion}`;
+
+  const response = await fetch(url, {
+    method: 'POST',
+    headers: {
+      'api-key': apiKey,
+      'Content-Type': 'application/json'
+    },
+    body: JSON.stringify({
+      messages: [
+        { role: 'system', content: systemPrompt },
+        { role: 'user', content: userContent }
+      ],
+      response_format: { type: 'json_object' },
+      temperature
+    })
+  });
+
+  if (!response.ok) {
+    const error = await response.text();
+    throw new Error(`Azure OpenAI API error: ${response.status} - ${error}`);
+  }
+
+  const data = await response.json();
+  const content = data.choices?.[0]?.message?.content;
+
+  if (!content) {
+    throw new Error('No content in Azure OpenAI response');
+  }
+
+  // Throttle to avoid rate limits
+  if (throttleMs > 0) {
+    console.log(`  Throttling ${throttleMs}ms...`);
+    await sleep(throttleMs);
+  }
+
+  return JSON.parse(content);
+}
 
 /**
  * Detect if a URL points to a PDF based on URL extension or content-type
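A note on rate limiting: `callAzureOpenAI` sleeps after every successful call, which spaces requests out but does not recover from an actual 429 response. If throttling alone proves insufficient, a retry wrapper along these lines could be layered on top. This is a sketch, not part of the diff; it assumes the `callAzureOpenAI`, `sleep`, and `throttleMs` definitions above, and `callWithRetry` is a hypothetical helper name:

```js
// Sketch (not in this diff): retry on rate-limit errors with exponential backoff.
// Detection relies on the status code that callAzureOpenAI embeds in its
// "Azure OpenAI API error: <status> - ..." message.
async function callWithRetry(systemPrompt, userContent, maxRetries = 3) {
  for (let attempt = 0; ; attempt++) {
    try {
      return await callAzureOpenAI(systemPrompt, userContent);
    } catch (error) {
      if (!/\b429\b/.test(error.message) || attempt >= maxRetries) throw error;
      const backoffMs = Math.max(throttleMs, 1000) * 2 ** attempt;
      console.log(`  Rate limited, retrying in ${backoffMs}ms...`);
      await sleep(backoffMs);
    }
  }
}
```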
@@ -49,7 +122,6 @@ function isPdfUrl(url, contentType) {
  * Convert PDF buffer to Markdown
  */
 async function pdfToMarkdown(buffer) {
-  console.log('Converting PDF to Markdown...');
   const markdown = await pdf2md(buffer);
   return markdown;
 }
@@ -58,7 +130,6 @@ async function pdfToMarkdown(buffer) {
  * Convert HTML to Markdown using Turndown
  */
 function htmlToMarkdown(html) {
-  console.log('Converting HTML to Markdown...');
   const turndown = new TurndownService({
     headingStyle: 'atx',
     codeBlockStyle: 'fenced'
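For reference, the Turndown options used above behave as follows on a small fabricated input (standalone sketch, runnable on its own):

```js
// Standalone sketch of the Turndown configuration from htmlToMarkdown.
import TurndownService from 'turndown';

const turndown = new TurndownService({
  headingStyle: 'atx',      // "# Heading" instead of underlined setext headings
  codeBlockStyle: 'fenced'  // fenced code blocks instead of 4-space indentation
});

console.log(turndown.turndown('<h1>Rente</h1><p>Ein <strong>Beispiel</strong>.</p>'));
// Prints:
// # Rente
//
// Ein **Beispiel**.
```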
@@ -79,47 +150,113 @@ async function urlToMarkdown(url) {
   const contentType = response.headers.get('content-type') || '';
 
   if (isPdfUrl(url, contentType)) {
-    // Handle PDF
+    console.log('  Converting PDF to Markdown...');
     const arrayBuffer = await response.arrayBuffer();
     const pdfBuffer = Buffer.from(arrayBuffer);
     return await pdfToMarkdown(pdfBuffer);
   } else {
-    // Handle HTML
+    console.log('  Converting HTML to Markdown...');
     const html = await response.text();
     return htmlToMarkdown(html);
   }
 }
 
 /**
- * Main function: fetch URL, convert to markdown, extract data with smartScraper
+ * Extract data from markdown using Azure OpenAI
+ */
+async function extractFromMarkdown(markdown, systemPrompt, userPromptPrefix) {
+  // Truncate markdown if too long (leave room for the prompts)
+  const truncatedMarkdown = markdown.length > maxMarkdownLength
+    ? markdown.substring(0, maxMarkdownLength) + '\n\n[Content truncated...]'
+    : markdown;
+
+  if (markdown.length > maxMarkdownLength) {
+    console.log(`  Truncated from ${markdown.length} to ${maxMarkdownLength} chars`);
+  }
+
+  return await callAzureOpenAI(systemPrompt, userPromptPrefix + truncatedMarkdown);
+}
+
+/**
+ * Process a single URL: fetch, convert to markdown, extract data
  */
 async function scrapeUrlToJson(url) {
   const markdown = await urlToMarkdown(url);
-  console.log(`Converted to ${markdown.length} characters of Markdown`);
+  console.log(`  Converted to ${markdown.length} characters of Markdown`);
+
+  console.log('  Extracting data...');
+  const result = await extractFromMarkdown(markdown, itemSystemPrompt, itemUserPrompt);
+
+  return { ...result, url };
+}
+
+/**
+ * Parse a date string to a Date object for sorting
+ */
+function parseDate(dateStr) {
+  if (!dateStr) return new Date(0);
+  const parsed = new Date(dateStr);
+  return isNaN(parsed.getTime()) ? new Date(0) : parsed;
+}
+
+/**
+ * Generate a meta summary from all items
+ */
+async function generateMetaSummary(items) {
+  const summariesMarkdown = items
+    .map((item, i) => `## Document ${i + 1}: ${item.title || 'Untitled'}\n\nDate: ${item.date || 'Unknown'}\n\n${item.summary || 'No summary'}`)
+    .join('\n\n---\n\n');
+
+  console.log('\nGenerating meta summary...');
+  console.log('Summaries being sent:\n' + summariesMarkdown.substring(0, 500) + '...\n');
+
+  return await extractFromMarkdown(summariesMarkdown, metaSystemPrompt, metaUserPrompt);
+}
+
+/**
+ * Main function: process multiple URLs and generate combined output
+ */
+async function scrapeMultipleUrls(urls) {
+  // Process each URL sequentially to avoid rate limits
+  const items = [];
+  for (let i = 0; i < urls.length; i++) {
+    console.log(`\n[${i + 1}/${urls.length}] Processing: ${urls[i]}`);
+    try {
+      const result = await scrapeUrlToJson(urls[i]);
+      items.push(result);
+    } catch (error) {
+      console.error(`  Error processing ${urls[i]}: ${error.message}`);
+      items.push({
+        url: urls[i],
+        title: null,
+        date: null,
+        summary: null,
+        error: error.message
+      });
+    }
+  }
+
+  // Sort items by date (oldest first; see parseDate for the fallback)
+  const sortedItems = items.sort((a, b) => {
+    return parseDate(a.date).getTime() - parseDate(b.date).getTime();
+  });
 
-  // Pass markdown to smartScraper
-  console.log('Extracting data with smartScraper...');
-  const result = await smartScraper(
-    apiKey,
-    null,     // url (null since we're using websiteMarkdown)
-    prompt,
-    null,     // schema
-    null,     // numberOfScrolls
-    null,     // totalPages
-    null,     // cookies
-    {},       // options
-    false,    // plain_text
-    false,    // renderHeavyJs
-    false,    // stealth
-    null,     // websiteHtml
-    markdown  // websiteMarkdown
-  );
+  // Generate the meta summary from items that were processed successfully
+  const validItems = sortedItems.filter(item => !item.error);
+  const meta = validItems.length > 0
+    ? await generateMetaSummary(validItems)
+    : { title: 'No valid documents', summary: 'All documents failed to process.' };
 
-  return result;
+  return {
+    title: meta.title,
+    summary: meta.summary,
+    items: sortedItems
+  };
 }
 
 try {
-  const result = await scrapeUrlToJson(inputUrl);
+  const result = await scrapeMultipleUrls(inputUrls);
+  console.log('\n' + '='.repeat(60));
   console.log('Result:', JSON.stringify(result, null, 2));
 } catch (error) {
   console.error('Error:', error);
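One subtlety in the sorting above is worth spelling out: `parseDate` maps missing or unparseable dates to `new Date(0)`, so such items sort to the front rather than throwing. A standalone illustration with fabricated data (it duplicates `parseDate` so it can run on its own):

```js
// Standalone illustration of the parseDate fallback used when sorting items.
function parseDate(dateStr) {
  if (!dateStr) return new Date(0);
  const parsed = new Date(dateStr);
  return isNaN(parsed.getTime()) ? new Date(0) : parsed;
}

const items = [
  { title: 'B', date: '2025-07-01' },
  { title: 'A', date: null },           // missing date -> epoch -> sorts first
  { title: 'C', date: 'next Tuesday' }  // unparseable -> epoch -> also first
];

items.sort((a, b) => parseDate(a.date).getTime() - parseDate(b.date).getTime());
console.log(items.map(i => i.title).join(', ')); // -> A, C, B
```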