Data Processing #321
name: Data Processing

# FORRT Data Processing Workflow
#
# Purpose: Automated data fetching and processing for FORRT website content
#
# Triggers:
#   - Daily at midnight UTC (scheduled)
#   - Manual trigger via the GitHub Actions UI (workflow_dispatch)
#
# Data Sources Processed:
#   1. Tenzing contributor data (Python script)
#   2. Curated resources (Python script)
#   3. Glossary files (Python script) - manual trigger only
#   4. Google Analytics data (Python script)
#   5. Google Scholar citations (Python script)
#   6. Contributor analysis (R script) - monthly only
#
# Outputs:
#   - Updated JSON data files in the data/ directory
#   - Static copies in static/data/ for client-side access
#   - Generated content committed to the build-resources branch
#   - An automated monthly PR for Google Analytics data
#
# The processed data is used throughout the Hugo website for dynamic content.

on:
  schedule:
    - cron: '0 0 * * *' # Daily at midnight UTC
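    # Cron fields: minute hour day-of-month month day-of-week, evaluated in UTC.
    # '0 0 * * *' fires every day at 00:00; a weekly Sunday run would instead be
    # '0 0 * * 0' (illustrative alternative, not enabled here).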
  workflow_dispatch:
    inputs:
      skip_deploy:
        description: 'Skip triggering deploy after processing'
        required: false
        type: boolean
        default: false
      regenerate_glossary:
        description: 'Regenerate glossary files (only use when glossary sources are stable)'
        required: false
        type: boolean
        default: false
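
# Manual trigger sketch (assumes gh CLI auth against this repository; the
# workflow is addressed by the name declared above):
#   gh workflow run "Data Processing" -f skip_deploy=true -f regenerate_glossary=false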

jobs:
  process-data:
    name: Process Data
    runs-on: ubuntu-22.04
    permissions:
      contents: write
      pull-requests: write
    env:
      PYTHON_VERSION: "3.11"
    steps:
      #=================
      # Repository Setup
      #=================
      # Checkout the repository code to the runner environment
      - name: Checkout repository
        uses: actions/checkout@v4
      #=======================
      # Workflow Configuration
      #=======================
      # Check if this is a monthly run (1st of the month or a manual trigger)
      - name: Check if monthly run
        id: monthly-run
        run: |
          CURRENT_DAY=$(date +%d)
          if [ "$CURRENT_DAY" != "01" ] && [ "${{ github.event_name }}" != "workflow_dispatch" ]; then
            echo "is_monthly=false" >> $GITHUB_OUTPUT
            echo "ℹ️ Skipping contributor analysis (not 1st of month and not manual trigger)"
          else
            echo "is_monthly=true" >> $GITHUB_OUTPUT
            echo "🔄 Monthly run detected - will run contributor analysis"
          fi
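      # date +%d is zero-padded ("01".."31"), which is why the comparison against
      # the literal "01" works. Later steps read the flag as
      # steps.monthly-run.outputs.is_monthly.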
      #==================
      # Environment Setup
      #==================
      # Set the Git identity used for automated commits and pushes
      - name: Configure Git
        run: |
          git config --global user.email "mudaherarich@gmail.com"
          git config --global user.name "richarddushime"
      #=========================================
      # Install Python 3.11 for running scripts
      #=========================================
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'
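      # The built-in pip cache keys off the repository's requirements file(s),
      # so dependency installs are reused across runs until requirements.txt changes.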
      #===========================================
      # Setup r2u for fast R package installation
      #===========================================
      - name: Setup r2u
        uses: eddelbuettel/github-actions/r2u-setup@master
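      # r2u serves CRAN packages as prebuilt Ubuntu binaries through apt, so the
      # install.packages() call below resolves in seconds instead of compiling
      # each package from source.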
      #===================================================
      # Install Pandoc for rendering R Markdown documents
      #===================================================
      - name: Set up Pandoc
        uses: r-lib/actions/setup-pandoc@v2
      #================================================================
      # Install R packages for contributor analysis and visualization
      #================================================================
      - name: Install R dependencies
        run: Rscript -e 'install.packages(c("rmarkdown", "ggplot2", "readxl", "dplyr", "googlesheets4", "stringr", "gridExtra", "glue", "tidygraph", "ggraph", "igraph", "visNetwork"))'
      #===============================
      # Contributor Analysis (Monthly)
      #===============================
      # Generate contributor analysis reports and network visualizations
      - name: Run Contributor Analysis
        if: steps.monthly-run.outputs.is_monthly == 'true'
        continue-on-error: true # Continue even if this step fails
        run: |
          echo "🚀 Running Contributor Analysis..."
          # Clean old files from content/contributor-analysis and partials
          rm -rf content/contributor-analysis/*.png content/contributor-analysis/*.html content/contributor-analysis/htmlwidgets_libs
          rm -f layouts/partials/network-graph.html
          # Render index.Rmd to generate contributor analysis content and plots
          echo "📊 Rendering contributor analysis..."
          Rscript -e "rmarkdown::render('content/contributor-analysis/index.Rmd')"
          # Render network-graph.Rmd to generate the interactive network visualization
          echo "🕸️ Rendering network visualization..."
          Rscript -e "rmarkdown::render('content/contributor-analysis/network-graph.Rmd')"
          # Move the generated HTML file to layouts/partials
          echo "📁 Moving network graph to partials..."
          mv content/contributor-analysis/network-graph.html layouts/partials/
          # Strip raw-HTML fence markers from index.md, if any
          sed -i.bak -e '/^```{=html}$/d' -e '/^```$/d' content/contributor-analysis/index.md && rm content/contributor-analysis/index.md.bak
          echo "✅ Contributor analysis complete"
      #========================
      # Tenzing Data Processing
      #========================
      # Install Python packages for data processing scripts
      - name: Install Python dependencies
        run: python3 -m pip install -r ./requirements.txt
      # Process contributor data using the Tenzing script
      - name: Run Tenzing script
        continue-on-error: true # Continue even if this step fails
        run: python3 scripts/forrt_contribs/tenzing.py
      # Check for Tenzing failures and create an issue if needed
      - name: Check Tenzing failures and create issue
        if: always() # Run even if the previous step failed
        continue-on-error: true # Don't fail the workflow if issue creation fails
        run: python3 scripts/forrt_contribs/create_failure_issue.py
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      # Process and organize curated resources data
      - name: Run Curated Resources script
        continue-on-error: true # Continue even if this step fails
        run: python3 content/resources/resource.py
      # Move Tenzing output to the content directory and validate it
      - name: Move and validate Tenzing output
        continue-on-error: true # Continue even if this step fails
        run: |
          mv scripts/forrt_contribs/tenzing.md content/contributors/tenzing.md
          if [ ! -f content/contributors/tenzing.md ]; then
            echo "tenzing.md not found"
            exit 1
          fi
      # Validate that every entry under content/curated_resources is a regular file
      - name: Validate curated resources
        continue-on-error: true # Continue even if this step fails
        run: |
          for file in content/curated_resources/*; do
            if [ ! -f "$file" ]; then
              echo "Not a regular file: $file"
              exit 1
            fi
          done
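      # If the intent were to enforce markdown-only content, a stricter check
      # could replace the test above (sketch, not enabled):
      #   case "$file" in
      #     *.md) ;;
      #     *) echo "Non-markdown file found: $file"; exit 1 ;;
      #   esac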
      # Process and generate glossary files
      - name: Run Glossary Generation script
        if: github.event.inputs.regenerate_glossary == 'true'
        continue-on-error: true # Continue even if this step fails
        run: python3 content/glossary/_create_glossaries.py
      # Download Google Analytics data and validate it
      - name: Download GA Data
        continue-on-error: true # Continue even if this step fails
        env:
          GA_API_CREDENTIALS: ${{ secrets.GA_API_CREDENTIALS }}
          GA_PROPERTY_ID: ${{ secrets.GA_PROPERTY_ID }}
        run: |
          if [ -z "$GA_API_CREDENTIALS" ] || [ -z "$GA_PROPERTY_ID" ]; then
            echo "❌ GA credentials not set"
            exit 1
          fi
          rm -f data/ga_data.json
          rm -rf data/ga_data/
          python3 scripts/download_ga_data.py
          if [ -f "data/ga_data.json" ]; then
            echo "✅ GA data file created successfully"
            echo "File size: $(wc -c < data/ga_data.json) bytes"
            # Quick validation of the data structure
            python3 -c "import json; data = json.load(open('data/ga_data.json')); print('✅ GA data:', len(data.get('regions', [])), 'countries,', len(data.get('top_pages', [])), 'pages')"
          else
            echo "❌ GA data file was not created"
            exit 1
          fi
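      # Minimal shape of data/ga_data.json implied by the check above (the real
      # file may carry more keys):
      #   {"regions": [...per-country rows...], "top_pages": [...per-page rows...]}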
      #===========================================================
      # Create a pull request for GA data updates on monthly runs
      #===========================================================
      - name: Create PR for GA data update
        if: github.event_name != 'pull_request'
        continue-on-error: true # Continue even if this step fails
        run: |
          echo "=== Creating PR for GA data update ==="
          # Reuse the monthly flag (1st of the month OR manually triggered)
          if [ "${{ steps.monthly-run.outputs.is_monthly }}" != "true" ]; then
            echo "ℹ️ Skipping PR creation (not 1st of month and not manual trigger)"
            exit 0
          fi
          BRANCH_NAME="ga-data-update-$(date +%Y%m%d-%H%M%S)"
          git fetch origin master
          git checkout master
          # Delete the local branch if it exists
          git branch -D "$BRANCH_NAME" 2>/dev/null || true
          git checkout -b "$BRANCH_NAME"
          # Verify we're on the correct branch
          CURRENT_BRANCH=$(git branch --show-current)
          if [ "$CURRENT_BRANCH" != "$BRANCH_NAME" ]; then
            echo "❌ Failed to create branch $BRANCH_NAME, currently on $CURRENT_BRANCH"
            exit 1
          fi
          echo "✅ Created and switched to branch: $BRANCH_NAME"
          # Add and commit the GA data file
          echo "Adding GA data file..."
          git add data/ga_data.json
          git commit -m "Update GA data - $(date -u +'%Y-%m-%d %H:%M:%S UTC')"
          if ! git push origin "$BRANCH_NAME" --force-with-lease; then
            git push origin "$BRANCH_NAME"
          fi
          gh pr create \
            --title "📊 Monthly GA Data Update - $(date '+%B %Y')" \
            --body "Automated monthly Google Analytics data update. Generated on $(date -u +'%Y-%m-%d %H:%M:%S UTC'). Files changed: data/ga_data.json" \
            --base master \
            --head "$BRANCH_NAME" \
            --label "ga-data,monthly-update"
          echo "✅ PR created for GA data update"
        env:
          GITHUB_TOKEN: ${{ secrets.FORRT_PAT }}
          GH_TOKEN: ${{ secrets.FORRT_PAT }}
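      # A personal access token (FORRT_PAT) is used deliberately: pull requests
      # and events created with the default GITHUB_TOKEN do not trigger other
      # workflows, so downstream automation would never fire.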
      #==========================
      # Google Scholar Citations
      #==========================
      # Execute the Google Scholar citation tracking script
      - name: Run Google Scholar script
        continue-on-error: true
        run: python3 scripts/gs-cite/google_scholar.py
        env:
          SERPAPI: ${{ secrets.SERPAPI }}
      #================
      # Artifact Upload
      #================
      # Upload all processed data files as an artifact
      - name: Upload data artifact
        id: upload-artifact
        uses: actions/upload-artifact@v4
        with:
          name: data-artifact
          path: |
            content/contributors/tenzing.md
            content/curated_resources/
            content/glossary/
            data/
            content/contributor-analysis/
            content/publications/citation_chart.webp
          retention-days: 7
      #=================================================================
      # Commit generated files to build-resources branch (via worktree)
      #=================================================================
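      # A git worktree checks build-resources out into a second directory, so the
      # job can stage and push generated files without disturbing the primary
      # checkout that later steps still rely on.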
      - name: Commit to build-resources branch
        if: github.event_name != 'pull_request'
        continue-on-error: true
        run: |
          echo "📝 Committing generated files to build-resources branch via worktree..."
          set -e
          WORKTREE_DIR="/tmp/build-resources-worktree"
          # Store generated files in a temp location
          mkdir -p /tmp/generated-resources
          cp -r content/curated_resources /tmp/generated-resources/
          cp content/contributors/tenzing.md /tmp/generated-resources/ || true
          cp data/ga_data.json /tmp/generated-resources/ga_data.json || true
          # Copy additional generated files
          mkdir -p /tmp/generated-resources/contributor-analysis
          cp -r content/contributor-analysis/* /tmp/generated-resources/contributor-analysis/ || true
          mkdir -p /tmp/generated-resources/publications
          cp content/publications/citation_chart.webp /tmp/generated-resources/publications/ || true
          if [ -f data/summaries.json ]; then
            cp data/summaries.json /tmp/generated-resources/summaries.json
          fi
          if [ "${{ github.event.inputs.regenerate_glossary }}" = "true" ]; then
            cp -r content/glossary /tmp/generated-resources/
          fi
          # Prepare the worktree for build-resources
          git fetch origin
          rm -rf "$WORKTREE_DIR"
          if git ls-remote --exit-code origin build-resources >/dev/null 2>&1; then
            echo "✓ build-resources branch exists, creating/updating worktree"
            git worktree add -B build-resources "$WORKTREE_DIR" origin/build-resources
          else
            echo "✓ build-resources does not exist, creating from master"
            git worktree add -b build-resources "$WORKTREE_DIR" origin/master
          fi
          # Apply updates inside the worktree
          pushd "$WORKTREE_DIR"
          # Ensure target directories exist
          mkdir -p content/curated_resources content/contributors data content/contributor-analysis content/publications
          # Remove old generated resource files (but keep _index.md)
          find content/curated_resources -type f ! -name '_index.md' -delete 2>/dev/null || true
          # Also clean up old contributor analysis files to avoid stale data
          rm -rf content/contributor-analysis/* 2>/dev/null || true
          # Copy newly generated files
          cp -r /tmp/generated-resources/curated_resources/* content/curated_resources/ || true
          if [ -f /tmp/generated-resources/tenzing.md ]; then
            cp /tmp/generated-resources/tenzing.md content/contributors/
          fi
          if [ -f /tmp/generated-resources/ga_data.json ]; then
            cp /tmp/generated-resources/ga_data.json data/ga_data.json
          fi
          if [ -f /tmp/generated-resources/summaries.json ]; then
            cp /tmp/generated-resources/summaries.json data/summaries.json
          fi
          # Copy contributor analysis and the citation chart
          cp -r /tmp/generated-resources/contributor-analysis/* content/contributor-analysis/ || true
          if [ -f /tmp/generated-resources/publications/citation_chart.webp ]; then
            cp /tmp/generated-resources/publications/citation_chart.webp content/publications/
          fi
          # Copy glossary files only if regenerated (preserving directory structure)
          if [ "${{ github.event.inputs.regenerate_glossary }}" = "true" ] && [ -d /tmp/generated-resources/glossary ]; then
            echo "✓ Updating glossary files in build-resources worktree"
            mkdir -p content/glossary
            find content/glossary -type f ! -name '_index.md' ! -name '_create_glossaries.py' -delete 2>/dev/null || true
            rsync -av --exclude='_index.md' --exclude='_create_glossaries.py' /tmp/generated-resources/glossary/ content/glossary/
          fi
          # Check for changes; git status --porcelain also reports untracked files,
          # which plain git diff would miss for newly generated content
          if [ -z "$(git status --porcelain)" ]; then
            echo "ℹ️ No changes to commit"
          else
            echo "✓ Changes detected, committing..."
            # Add all potential files
            git add content/curated_resources/ 2>/dev/null || true
            git add content/contributors/tenzing.md 2>/dev/null || true
            git add data/ga_data.json 2>/dev/null || true
            git add data/summaries.json 2>/dev/null || true
            git add content/contributor-analysis/ 2>/dev/null || true
            git add content/publications/citation_chart.webp 2>/dev/null || true
            if [ "${{ github.event.inputs.regenerate_glossary }}" = "true" ]; then
              git add content/glossary/ 2>/dev/null || true
            fi
            git commit -m "Update generated resources and data - $(date -u +'%Y-%m-%d %H:%M:%S UTC')" || echo "Nothing to commit"
            # Push to build-resources with retry logic
            MAX_RETRIES=3
            RETRY_COUNT=0
            while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
              # force-with-lease is safer than a plain force push while still
              # allowing the branch to be overwritten when it is stale
              if git push -u origin build-resources --force-with-lease; then
                echo "✅ Successfully pushed to build-resources branch"
                break
              else
                RETRY_COUNT=$((RETRY_COUNT + 1))
                if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
                  echo "⚠️ Push failed, retrying ($RETRY_COUNT/$MAX_RETRIES)..."
                  sleep 2
                  git pull --rebase || true
                else
                  echo "❌ Push failed after $MAX_RETRIES attempts"
                  exit 1
                fi
              fi
            done
          fi
          popd
          # Clean up the worktree
          git worktree remove "$WORKTREE_DIR" --force || true
        env:
          GITHUB_TOKEN: ${{ secrets.FORRT_PAT }}
      #====================
      # Trigger Deployment
      #====================
      # Trigger the deploy workflow to publish updated data. A PAT is required
      # here: repository_dispatch events sent with the default GITHUB_TOKEN do
      # not start other workflows.
      - name: Trigger deployment
        if: github.event_name != 'pull_request' && inputs.skip_deploy != true
        run: |
          echo "🚀 Triggering deployment workflow..."
          curl -s -X POST \
            -H "Authorization: Bearer ${{ secrets.FORRT_PAT }}" \
            -H "Accept: application/vnd.github+json" \
            "https://api.github.com/repos/${{ github.repository }}/dispatches" \
            -d '{"event_type": "data-update", "client_payload": {"data_update": true}}'
          echo "✅ Deployment triggered successfully"
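      # The receiving deploy workflow would listen for this event with a trigger
      # along these lines (sketch of the counterpart, which lives in another file):
      #   on:
      #     repository_dispatch:
      #       types: [data-update]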