name: Data Processing
# FORRT Data Processing Workflow
#
# Purpose: Automated data fetching and processing for FORRT website content
#
# Triggers:
# - Daily at midnight UTC (scheduled)
# - Manual trigger via GitHub Actions UI (workflow_dispatch)
#
# Data Sources Processed:
# 1. Contributor data via Tenzing (Python script)
# 2. Curated Resources (Python script)
# 3. Glossary files (Python script) - only when regenerate_glossary is set
# 4. Google Analytics data (Python script)
# 5. Google Scholar citations (Python script)
# 6. Contributor analysis (R script) - monthly or manual runs only
#
# Outputs:
# - Updated JSON data files in data/ directory
# - Static copies in static/data/ for client-side access
# - Automated PRs for Google Analytics data updates (monthly)
#
# The processed data is used throughout the Hugo website for dynamic content.
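#
# Example (a sketch; the workflow can also be run by name from the GitHub CLI,
# with -f flags matching the workflow_dispatch inputs declared below):
#   gh workflow run "Data Processing" -f skip_deploy=true -f regenerate_glossary=false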
on:
schedule:
    - cron: '0 0 * * *' # Daily at midnight UTC
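    # Cron fields: minute, hour, day-of-month, month, day-of-week;
    # '0 0 * * *' therefore fires at 00:00 UTC every day.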
workflow_dispatch:
inputs:
skip_deploy:
description: 'Skip triggering deploy after processing'
required: false
type: boolean
default: false
regenerate_glossary:
description: 'Regenerate glossary files (only use when glossary sources are stable)'
required: false
type: boolean
default: false
jobs:
process-data:
name: Process Data
runs-on: ubuntu-22.04
permissions:
contents: write
pull-requests: write
env:
PYTHON_VERSION: "3.11"
steps:
#================
# Repository Setup
#================
- name: Checkout repository
uses: actions/checkout@v4
# Checkout the repository code to the runner environment
#======================
# Workflow Configuration
#======================
      #========================================
      # Check if this is a monthly run (1st of month or manual trigger)
      #========================================
- name: Check if monthly run
id: monthly-run
run: |
CURRENT_DAY=$(date +%d)
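          # date +%d is zero-padded (01-31), so the 1st of the month is exactly "01"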
if [ "$CURRENT_DAY" != "01" ] && [ "${{ github.event_name }}" != "workflow_dispatch" ]; then
echo "is_monthly=false" >> $GITHUB_OUTPUT
echo "ℹ️ Skipping contributor analysis (not 1st of month and not manual trigger)"
else
echo "is_monthly=true" >> $GITHUB_OUTPUT
echo "🔄 Monthly run detected - will run contributor analysis"
fi
#=================
# Environment Setup
#=================
#========================================
# Configure Git with identity for commits
#========================================
- name: Configure Git
run: |
git config --global user.email "mudaherarich@gmail.com"
git config --global user.name "richarddushime"
          # This identity is used for the workflow's automated commits (GA data PR and build-resources updates)
#========================================
# Install Python 3.11 for running scripts
#========================================
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
cache: 'pip'
#========================================
# Setup r2u for fast R package installation
#========================================
- name: Setup r2u
uses: eddelbuettel/github-actions/r2u-setup@master
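      # r2u serves CRAN packages as pre-built Ubuntu binaries installed via
      # apt (through bspm), so install.packages() below resolves in seconds
      # instead of compiling each package from source.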
#========================================
# Install Pandoc for rendering R Markdown documents
#========================================
- uses: r-lib/actions/setup-pandoc@v2
#========================================
# Install R packages for contributor analysis and visualization
#========================================
- name: Install tenzing R dependencies
run: Rscript -e 'install.packages(c("rmarkdown","ggplot2", "readxl", "dplyr", "googlesheets4", "stringr", "gridExtra", "glue", "tidygraph", "ggraph", "igraph", "visNetwork"))'
#==============================
# Contributor Analysis (Monthly)
#==============================
#========================================
# Generate contributor analysis reports and network visualizations
#========================================
- name: Run Contributor Analysis
if: steps.monthly-run.outputs.is_monthly == 'true'
continue-on-error: true # Continue even if this step fails
run: |
echo "🚀 Running Contributor Analysis..."
# Clean old files from content/contributor-analysis and partials
rm -rf content/contributor-analysis/*.png content/contributor-analysis/*.html content/contributor-analysis/htmlwidgets_libs
rm -f layouts/partials/network-graph.html
# Run index.Rmd to generate contributor analysis content and plots
echo "📊 Rendering contributor analysis..."
Rscript -e "rmarkdown::render('content/contributor-analysis/index.Rmd')"
# Run network-graph.Rmd to generate interactive network visualization
echo "🕸️ Rendering network visualization..."
Rscript -e "rmarkdown::render('content/contributor-analysis/network-graph.Rmd')"
# Move generated HTML file to layouts/partials
echo "📁 Moving network graph to partials..."
mv content/contributor-analysis/network-graph.html layouts/partials/
# Clean up HTML artifacts from index.md if any
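          # Pandoc wraps raw HTML output in ```{=html} fences; dropping the fence lines lets Hugo render the embedded HTML directly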
sed -i.bak -e '/^```{=html}$/d' -e '/^```$/d' content/contributor-analysis/index.md && rm content/contributor-analysis/index.md.bak
echo "✅ Contributor analysis complete"
#=======================
# Tenzing Data Processing
#=======================
#========================================
# Install Python packages for data processing scripts
#========================================
- name: Install Python dependencies
run: python3 -m pip install -r ./requirements.txt
#========================================
# Process contributor data using Tenzing script
#========================================
- name: Run Tenzing script
continue-on-error: true # Continue even if this step fails
run: python3 scripts/forrt_contribs/tenzing.py
#========================================
# Check for Tenzing failures and create issue if needed
#========================================
- name: Check Tenzing failures and create issue
if: always() # Run even if previous step failed
continue-on-error: true # Don't fail the workflow if issue creation fails
run: python3 scripts/forrt_contribs/create_failure_issue.py
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
#========================================
# Process and organize curated resources data
#========================================
- name: Run Curated Resources script
continue-on-error: true # Continue even if this step fails
run: python3 content/resources/resource.py
# Execute the curated resources script that processes and organizes resource data
#========================================
# Move Tenzing output to content directory and validate
#========================================
- name: Move and validate Tenzing output
continue-on-error: true # Continue even if this step fails
run: |
mv scripts/forrt_contribs/tenzing.md content/contributors/tenzing.md
if [ ! -f content/contributors/tenzing.md ]; then
echo "tenzing.md not found"
exit 1
fi
#========================================
      # Validate that curated resources files exist under content/curated_resources
#========================================
- name: Validate curated resources
continue-on-error: true # Continue even if this step fails
run: |
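          # Note: if the directory is empty, the glob stays literal and the -f test fails, so an empty output directory also fails this check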
for file in content/curated_resources/*; do
if [ ! -f "$file" ]; then
echo "Non-markdown file found: $file"
exit 1
fi
done
#========================================
# Process and generate glossary files
#========================================
- name: Run Glossary Generation script
if: github.event.inputs.regenerate_glossary == 'true'
continue-on-error: true # Continue even if this step fails
run: python3 content/glossary/_create_glossaries.py
# Execute the glossary script that generates glossary markdown files
#========================================
# Download Google Analytics data and validate
#========================================
- name: Download GA Data
continue-on-error: true # Continue even if this step fails
env:
GA_API_CREDENTIALS: ${{ secrets.GA_API_CREDENTIALS }}
GA_PROPERTY_ID: ${{ secrets.GA_PROPERTY_ID }}
run: |
if [ -z "$GA_API_CREDENTIALS" ] || [ -z "$GA_PROPERTY_ID" ]; then
echo "❌ GA credentials not set"
exit 1
fi
rm -f data/ga_data.json
rm -rf data/ga_data/
          python3 scripts/download_ga_data.py
if [ -f "data/ga_data.json" ]; then
echo "✅ GA data file created successfully"
echo "File size: $(wc -c < data/ga_data.json) bytes"
# Quick validation of data structure
python3 -c "import json; data = json.load(open('data/ga_data.json')); print('✅ GA data:', len(data.get('regions', [])), 'countries,', len(data.get('top_pages', [])), 'pages')"
else
echo "❌ GA data file was not created"
exit 1
fi
#=======================================================
# Create a pull request for GA data updates on monthly runs
#=======================================================
- name: Create PR for GA data update
if: github.event_name != 'pull_request'
continue-on-error: true # Continue even if this step fails
run: |
echo "=== Creating PR for GA data update ==="
# Check if it's the first day of the month OR manually triggered
CURRENT_DAY=$(date +%d)
if [ "$CURRENT_DAY" != "01" ] && [ "${{ github.event_name }}" != "workflow_dispatch" ]; then
echo "ℹ️ Skipping PR creation (not 1st of month and not manual trigger)"
exit 0
fi
BRANCH_NAME="ga-data-update-$(date +%Y%m%d-%H%M%S)"
git fetch origin master
git checkout master
# Delete local branch if it exists
git branch -D "$BRANCH_NAME" 2>/dev/null || true
git checkout -b "$BRANCH_NAME"
# Verify we're on the correct branch
CURRENT_BRANCH=$(git branch --show-current)
if [ "$CURRENT_BRANCH" != "$BRANCH_NAME" ]; then
echo "❌ Failed to create branch $BRANCH_NAME, currently on $CURRENT_BRANCH"
exit 1
fi
echo "✅ Created and switched to branch: $BRANCH_NAME"
# Add and commit the GA data file
echo "Adding GA data file..."
git add data/ga_data.json
git commit -m "Update GA data - $(date -u +'%Y-%m-%d %H:%M:%S UTC')"
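          # Prefer --force-with-lease so unseen remote updates are not clobbered; the plain-push fallback is equivalent for a brand-new branch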
if ! git push origin "$BRANCH_NAME" --force-with-lease; then
git push origin "$BRANCH_NAME"
fi
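          # --label takes a comma-separated list; both labels must already exist in the repository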
gh pr create \
--title "📊 Monthly GA Data Update - $(date '+%B %Y')" \
--body "Automated monthly Google Analytics data update. Generated on $(date -u +'%Y-%m-%d %H:%M:%S UTC'). Files changed: data/ga_data.json" \
--base master \
--head "$BRANCH_NAME" \
--label "ga-data,monthly-update"
echo "✅ PR created for GA data update"
env:
GITHUB_TOKEN: ${{ secrets.FORRT_PAT }}
GH_TOKEN: ${{ secrets.FORRT_PAT }}
      #=========================
      # Google Scholar Citations
      #=========================
      #========================================
# Execute Google Scholar citation tracking script
#========================================
- name: Run Google Scholar script
continue-on-error: true
run: python3 scripts/gs-cite/google_scholar.py
env:
SERPAPI: ${{ secrets.SERPAPI }}
#==============
# Artifact Upload
#==============
#========================================
# Upload all processed data files as artifact
#========================================
- name: Upload data artifact
id: upload-artifact
uses: actions/upload-artifact@v4
with:
name: data-artifact
path: |
content/contributors/tenzing.md
content/curated_resources/
content/glossary/
data/
content/contributor-analysis/
content/publications/citation_chart.webp
retention-days: 7
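      # (Hypothetical usage) Download this artifact locally for inspection:
      #   gh run download <run-id> -n data-artifact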
#========================================
# Commit generated files to build-resources branch (via worktree)
#========================================
- name: Commit to build-resources branch
if: github.event_name != 'pull_request'
continue-on-error: true
run: |
echo "📝 Committing generated files to build-resources branch via worktree..."
set -e
WORKTREE_DIR="/tmp/build-resources-worktree"
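          # A git worktree gives build-resources its own checkout directory, so generated files in the main working tree stay untouched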
# Store generated files in temp location
mkdir -p /tmp/generated-resources
cp -r content/curated_resources /tmp/generated-resources/
cp content/contributors/tenzing.md /tmp/generated-resources/ || true
cp data/ga_data.json /tmp/generated-resources/ga_data.json || true
# Copy additional generated files
mkdir -p /tmp/generated-resources/contributor-analysis
cp -r content/contributor-analysis/* /tmp/generated-resources/contributor-analysis/ || true
mkdir -p /tmp/generated-resources/publications
cp content/publications/citation_chart.webp /tmp/generated-resources/publications/ || true
if [ -f data/summaries.json ]; then
cp data/summaries.json /tmp/generated-resources/summaries.json
fi
if [ "${{ github.event.inputs.regenerate_glossary }}" = "true" ]; then
cp -r content/glossary /tmp/generated-resources/
fi
# Prepare worktree for build-resources
git fetch origin
rm -rf "$WORKTREE_DIR"
if git ls-remote --exit-code origin build-resources >/dev/null 2>&1; then
echo "✓ build-resources branch exists, creating/updating worktree"
git worktree add -B build-resources "$WORKTREE_DIR" origin/build-resources
else
echo "✓ build-resources does not exist, creating from master"
git worktree add -b build-resources "$WORKTREE_DIR" origin/master
fi
# Apply updates inside the worktree
pushd "$WORKTREE_DIR"
# Ensure target directories exist
mkdir -p content/curated_resources content/contributors data content/contributor-analysis content/publications
# Remove old generated resource files (but keep _index.md)
find content/curated_resources -type f ! -name '_index.md' -delete 2>/dev/null || true
# We also want to clean up old contributor analysis files to avoid stale data
rm -rf content/contributor-analysis/* 2>/dev/null || true
# Copy newly generated files
cp -r /tmp/generated-resources/curated_resources/* content/curated_resources/ || true
if [ -f /tmp/generated-resources/tenzing.md ]; then
cp /tmp/generated-resources/tenzing.md content/contributors/
fi
if [ -f /tmp/generated-resources/ga_data.json ]; then
cp /tmp/generated-resources/ga_data.json data/ga_data.json
fi
if [ -f /tmp/generated-resources/summaries.json ]; then
cp /tmp/generated-resources/summaries.json data/summaries.json
fi
# Copy contributor analysis and citation chart
cp -r /tmp/generated-resources/contributor-analysis/* content/contributor-analysis/ || true
if [ -f /tmp/generated-resources/publications/citation_chart.webp ]; then
cp /tmp/generated-resources/publications/citation_chart.webp content/publications/
fi
# Copy glossary files only if regenerated (preserving directory structure)
if [ "${{ github.event.inputs.regenerate_glossary }}" = "true" ] && [ -d /tmp/generated-resources/glossary ]; then
echo "✓ Updating glossary files in build-resources worktree"
mkdir -p content/glossary
find content/glossary -type f ! -name '_index.md' ! -name '_create_glossaries.py' -delete 2>/dev/null || true
rsync -av --exclude='_index.md' --exclude='_create_glossaries.py' /tmp/generated-resources/glossary/ content/glossary/
fi
# Check if there are any changes to commit
          # git diff misses untracked files, so use git status to detect newly generated files as well
          if [ -z "$(git status --porcelain)" ]; then
echo "ℹ️ No changes to commit"
else
echo "✓ Changes detected, committing..."
# Add all potential files
git add content/curated_resources/ 2>/dev/null || true
git add content/contributors/tenzing.md 2>/dev/null || true
git add data/ga_data.json 2>/dev/null || true
git add data/summaries.json 2>/dev/null || true
git add content/contributor-analysis/ 2>/dev/null || true
git add content/publications/citation_chart.webp 2>/dev/null || true
if [ "${{ github.event.inputs.regenerate_glossary }}" = "true" ]; then
git add content/glossary/ 2>/dev/null || true
fi
git commit -m "Update generated resources and data - $(date -u +'%Y-%m-%d %H:%M:%S UTC')" || echo "Nothing to commit"
# Push to build-resources with retry logic
MAX_RETRIES=3
RETRY_COUNT=0
while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
            # --force-with-lease overwrites the branch while still refusing to clobber remote updates we have not fetched
if git push -u origin build-resources --force-with-lease; then
echo "✅ Successfully pushed to build-resources branch"
break
else
RETRY_COUNT=$((RETRY_COUNT + 1))
if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
echo "⚠️ Push failed, retrying ($RETRY_COUNT/$MAX_RETRIES)..."
sleep 2
git pull --rebase || true
else
echo "❌ Push failed after $MAX_RETRIES attempts"
exit 1
fi
fi
done
fi
popd
# Clean up worktree
git worktree remove "$WORKTREE_DIR" --force || true
env:
GITHUB_TOKEN: ${{ secrets.FORRT_PAT }}
      #================
      # Trigger Deploy
      #================
      #========================================
      # Trigger the deploy workflow to publish updated data. The deploy
      # workflow is expected to listen for repository_dispatch events of
      # type "data-update".
      #========================================
      - name: Trigger Deploy
        if: github.event_name != 'pull_request' && inputs.skip_deploy != true
        run: |
          echo "🚀 Triggering deployment workflow..."
          gh api "repos/${{ github.repository }}/dispatches" \
            -f event_type=data-update \
            -F 'client_payload[data_update]=true'
          echo "✅ Deployment triggered successfully"
        env:
          # A PAT is required here: repository_dispatch events created with the
          # default GITHUB_TOKEN do not start new workflow runs.
          GH_TOKEN: ${{ secrets.FORRT_PAT }}