From 72619d8973980417f68f40443fce270ee2bae73a Mon Sep 17 00:00:00 2001
From: jairomelo <jairoantoniomelo@gmail.com>
Date: Wed, 21 Jan 2026 16:34:22 -0800
Subject: [PATCH 1/4] Enhance web scraping functionality with error handling
 and fix data extraction logic

---
 episodes/a-real-website.md | 48 +++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 21 deletions(-)
diff --git a/episodes/a-real-website.md b/episodes/a-real-website.md
index 069234f..59416b5 100644
--- a/episodes/a-real-website.md
+++ b/episodes/a-real-website.md
@@ -56,11 +56,17 @@ from tqdm import tqdm
 
 # Getting the HTML from our desired URL as a text string
 url = 'https://carpentries.org/workshops/upcoming-workshops/'
-req = requests.get(url).text
+req = requests.get(url)
 
-# Cleaning and printing the string
-cleaned_req = re.sub(r'\s*\n\s*', '', req).strip()
-print(cleaned_req[0:1000])
+# Checking if the request was successful
+if req.status_code == 200:
+    req = req.text
+
+    # Cleaning and printing the string
+    cleaned_req = re.sub(r'\s*\n\s*', '', req).strip()
+    print(cleaned_req[0:1000])
+else:
+    print(f"Failed to retrieve the webpage. Status code: {req.status_code}")
 ```
 
 ```output
@@ -114,7 +120,7 @@ soup = BeautifulSoup(cleaned_req, 'html.parser')
 # Finding all third-level headers and doing a formatted print
 h3_by_tag = soup.find_all('h3')
 print("Number of h3 elements found: ", len(h3_by_tag))
-for n, h3 in enumerate(h3_by_tag):
+for n, h3 in enumerate(h3_by_tag, start=1):
     print(f"Workshop #{n} - {h3.get_text()}")
 ```
 
@@ -159,18 +165,18 @@ print(div_firsth3.prettify())
 
 Remember, the output shown here is probably different than yours, as the website is continuously updated.
 ```output
-<div class="p-8 mb-5 border" data-country="Puerto Rico" data-curriculum="Software Carpentry (Shell, Git, R for Reproducible Scientific Analysis)" data-meeting="In Person" data-program="Software Carpentry">
+<div class="p-8 mb-5 border" data-country="United States" data-curriculum="Library Carpentry (Intro to Data, Unix Shell, Git, and/or OpenRefine)" data-meeting="In Person" data-program="Library Carpentry">
  <div class="flex mb-4 -mx-2">
   <div class="flex items-center mx-2">
-   <img alt="" class="mx-1" src="/software.svg"/>
+   <img alt="" class="mx-1" src="/library.svg"/>
    <span class="text-[0.625rem] uppercase">
-    Software Carpentry
+    Library Carpentry
    </span>
   </div>
   <div class="flex items-center mx-2">
-   <img alt="" class="mr-1" height="20" src="/flags/pr.png" width="20"/>
+   <img alt="" class="mr-1" height="20" src="/flags/us.png" width="20"/>
    <span class="text-[0.625rem] uppercase">
-    Puerto Rico
+    United States
    </span>
   </div>
   <div class="flex items-center mx-2">
@@ -181,12 +187,12 @@ Remember, the output shown here is probably different than yours, as the website
   </div>
  </div>
  <h3 class="title text-base md:text-[1.75rem] leading-[2.125rem] font-semibold">
-  <a class="underline hover:text-blue-hover text-gray-dark" href="https://dept-ccom-uprrp.github.io/2025-06-04-uprrp-r/">
-   University of Puerto Rico
+  <a class="underline hover:text-blue-hover text-gray-dark" href="https://unt-carpentries.github.io/2026-01-22-unt/">
+   University of North Texas
   </a>
  </h3>
  <div class="mb-5 text-lg font-semibold text-gray-mid">
-  Software Carpentry (Shell, Git, R for Reproducible Scientific Analysis)
+  Library Carpentry (Intro to Data, Unix Shell, Git, and/or OpenRefine)
  </div>
  <div class="mb-2 text-xs">
   <strong class="font-bold">
@@ -194,7 +200,7 @@ Remember, the output shown here is probably different than yours, as the website
   </strong>
   :
   <span class="instructors">
-   Humberto Ortiz-Zuazaga, Airined Montes Mercado
+   Sarah Lynn Fisher, Maristella Feustle, Whitney Johnson-Freeman
   </span>
  </div>
  <div class="mb-4 text-xs">
@@ -203,11 +209,11 @@ Remember, the output shown here is probably different than yours, as the website
   </strong>
   :
   <span class="helpers">
-   Isabel Rivera, Diana Buitrago Escobar, Yabdiel Ramos Valerio
+   Marcia McIntosh, Trey Clark
   </span>
  </div>
  <div class="text-sm font-semibold text-gray-mid">
-  Jun 04 - Jun 10 2025
+  Jan 22 - Jan 22 2026
  </div>
 </div>
 ```
@@ -246,11 +252,11 @@ workshop_list = []
 for item in divs: 
     dict_workshop = {}
     dict_workshop['host'] = item.find('h3').get_text()
-    dict_workshop['link'] = div_firsth3.find('h3').find('a').get('href')
-    dict_workshop['curriculum'] = div_firsth3.get('data-curriculum')
-    dict_workshop['country'] = div_firsth3.get('data-country')
-    dict_workshop['format'] = div_firsth3.get('data-meeting')
-    dict_workshop['program'] = div_firsth3.get('data-program')
+    dict_workshop['link'] = item.find('h3').find('a').get('href') # get is used to access attribute values as a dictionary
+    dict_workshop['curriculum'] = item.get('data-curriculum')
+    dict_workshop['country'] = item.get('data-country')
+    dict_workshop['format'] = item.get('data-meeting')
+    dict_workshop['program'] = item.get('data-program')
     workshop_list.append(dict_workshop)
 
 # Transform list into a DataFrame

From 5d4d63cb3e05d38674ab1f2abb773e476d7c6467 Mon Sep 17 00:00:00 2001
From: jairomelo <jairoantoniomelo@gmail.com>
Date: Thu, 22 Jan 2026 13:38:47 -0800
Subject: [PATCH 2/4] Fix link extraction logic in workshop dictionary

---
 episodes/a-real-website.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/episodes/a-real-website.md b/episodes/a-real-website.md
index 59416b5..e2612ec 100644
--- a/episodes/a-real-website.md
+++ b/episodes/a-real-website.md
@@ -230,7 +230,7 @@ As shown in the previous episode, we can store all this information in a Python
 # Create an empty dictionary and fill it with the info we are interested in
 dict_workshop = {}
 dict_workshop['host'] = div_firsth3.find('h3').get_text()
-dict_workshop['link'] = div_firsth3.find('h3').find('a').get('href')
+dict_workshop['link'] = div_firsth3.find('a').get('href')
 dict_workshop['curriculum'] = div_firsth3.get('data-curriculum')
 dict_workshop['country'] = div_firsth3.get('data-country')
 dict_workshop['format'] = div_firsth3.get('data-meeting')

From 5ce4f61bc802b13b342eb81fe07ea683e772f8e8 Mon Sep 17 00:00:00 2001
From: jairomelo <jairoantoniomelo@gmail.com>
Date: Thu, 22 Jan 2026 13:40:44 -0800
Subject: [PATCH 3/4] Fix link extraction logic in workshop dictionary - it is
 not necessary to iterate over h3 again to get the link

---
 episodes/a-real-website.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/episodes/a-real-website.md b/episodes/a-real-website.md
index e2612ec..88fd9d6 100644
--- a/episodes/a-real-website.md
+++ b/episodes/a-real-website.md
@@ -252,7 +252,7 @@ workshop_list = []
 for item in divs: 
     dict_workshop = {}
     dict_workshop['host'] = item.find('h3').get_text()
-    dict_workshop['link'] = item.find('h3').find('a').get('href') # get is used to access attribute values as a dictionary
+    dict_workshop['link'] = item.find('a').get('href') # get is used to access attribute values as a dictionary
     dict_workshop['curriculum'] = item.get('data-curriculum')
     dict_workshop['country'] = item.get('data-country')
     dict_workshop['format'] = item.get('data-meeting')
@@ -283,7 +283,7 @@ workshop_list = []
 while child_div is not None:
     dict_workshop = {}
     dict_workshop['host'] = child_div.find('h3').get_text()
-    dict_workshop['link'] = child_div.find('h3').find('a').get('href')
+    dict_workshop['link'] = child_div.find('a').get('href')
     dict_workshop['curriculum'] = child_div.get('data-curriculum')
     dict_workshop['country'] = child_div.get('data-country')
     dict_workshop['format'] = child_div.get('data-meeting')
@@ -325,7 +325,7 @@ workshop_list = []
 for item in divs_past:
     dict_workshop = {}
     dict_workshop['host'] = item.find('h3').get_text()
-    dict_workshop['link'] = item.find('h3').find('a').get('href')
+    dict_workshop['link'] = item.find('a').get('href')
     dict_workshop['curriculum'] = item.get('data-curriculum')
     dict_workshop['country'] = item.get('data-country')
     dict_workshop['format'] = item.get('data-meeting')
@@ -362,7 +362,7 @@ We only need to add three lines to our loop, and this is how it would look like.
 for item in divs: 
     dict_workshop = {}
     dict_workshop['host'] = item.find('h3').get_text()
-    dict_workshop['link'] = item.find('h3').find('a')['href']
+    dict_workshop['link'] = item.find('a')['href']
     dict_workshop['curriculum'] = item.get('data-curriculum')
     dict_workshop['country'] = item.get('data-country')
     dict_workshop['format'] = item.get('data-meeting')

From e992940ecc60cfe62c8f72b2af0f2caca3e5089d Mon Sep 17 00:00:00 2001
From: Jose Nino <jdninom@unal.edu.co>
Date: Mon, 26 Jan 2026 11:48:43 -0800
Subject: [PATCH 4/4] Revert episode output changes and restore find('h3')
 for workshop links

---
 episodes/a-real-website.md | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/episodes/a-real-website.md b/episodes/a-real-website.md
index 88fd9d6..b224686 100644
--- a/episodes/a-real-website.md
+++ b/episodes/a-real-website.md
@@ -165,18 +165,18 @@ print(div_firsth3.prettify())
 
 Remember, the output shown here is probably different than yours, as the website is continuously updated.
 ```output
-<div class="p-8 mb-5 border" data-country="United States" data-curriculum="Library Carpentry (Intro to Data, Unix Shell, Git, and/or OpenRefine)" data-meeting="In Person" data-program="Library Carpentry">
+<div class="p-8 mb-5 border" data-country="Puerto Rico" data-curriculum="Software Carpentry (Shell, Git, R for Reproducible Scientific Analysis)" data-meeting="In Person" data-program="Software Carpentry">
  <div class="flex mb-4 -mx-2">
   <div class="flex items-center mx-2">
-   <img alt="" class="mx-1" src="/library.svg"/>
+   <img alt="" class="mx-1" src="/software.svg"/>
    <span class="text-[0.625rem] uppercase">
-    Library Carpentry
+    Software Carpentry
    </span>
   </div>
   <div class="flex items-center mx-2">
-   <img alt="" class="mr-1" height="20" src="/flags/us.png" width="20"/>
+   <img alt="" class="mr-1" height="20" src="/flags/pr.png" width="20"/>
    <span class="text-[0.625rem] uppercase">
-    United States
+    Puerto Rico
    </span>
   </div>
   <div class="flex items-center mx-2">
@@ -187,12 +187,12 @@ Remember, the output shown here is probably different than yours, as the website
   </div>
  </div>
  <h3 class="title text-base md:text-[1.75rem] leading-[2.125rem] font-semibold">
-  <a class="underline hover:text-blue-hover text-gray-dark" href="https://unt-carpentries.github.io/2026-01-22-unt/">
-   University of North Texas
+  <a class="underline hover:text-blue-hover text-gray-dark" href="https://dept-ccom-uprrp.github.io/2025-06-04-uprrp-r/">
+   University of Puerto Rico
   </a>
  </h3>
  <div class="mb-5 text-lg font-semibold text-gray-mid">
-  Library Carpentry (Intro to Data, Unix Shell, Git, and/or OpenRefine)
+  Software Carpentry (Shell, Git, R for Reproducible Scientific Analysis)
  </div>
  <div class="mb-2 text-xs">
   <strong class="font-bold">
@@ -200,7 +200,7 @@ Remember, the output shown here is probably different than yours, as the website
   </strong>
   :
   <span class="instructors">
-   Sarah Lynn Fisher, Maristella Feustle, Whitney Johnson-Freeman
+   Humberto Ortiz-Zuazaga, Airined Montes Mercado
   </span>
  </div>
  <div class="mb-4 text-xs">
@@ -209,11 +209,11 @@ Remember, the output shown here is probably different than yours, as the website
   </strong>
   :
   <span class="helpers">
-   Marcia McIntosh, Trey Clark
+   Isabel Rivera, Diana Buitrago Escobar, Yabdiel Ramos Valerio
   </span>
  </div>
  <div class="text-sm font-semibold text-gray-mid">
-  Jan 22 - Jan 22 2026
+  Jun 04 - Jun 10 2025
  </div>
 </div>
 ```
@@ -230,7 +230,7 @@ As shown in the previous episode, we can store all this information in a Python
 # Create an empty dictionary and fill it with the info we are interested in
 dict_workshop = {}
 dict_workshop['host'] = div_firsth3.find('h3').get_text()
-dict_workshop['link'] = div_firsth3.find('a').get('href')
+dict_workshop['link'] = div_firsth3.find('h3').find('a').get('href')
 dict_workshop['curriculum'] = div_firsth3.get('data-curriculum')
 dict_workshop['country'] = div_firsth3.get('data-country')
 dict_workshop['format'] = div_firsth3.get('data-meeting')
@@ -252,7 +252,7 @@ workshop_list = []
 for item in divs: 
     dict_workshop = {}
     dict_workshop['host'] = item.find('h3').get_text()
-    dict_workshop['link'] = item.find('a').get('href') # get is used to access attribute values as a dictionary
+    dict_workshop['link'] = item.find('h3').find('a').get('href') # get is used to access attribute values as a dictionary
     dict_workshop['curriculum'] = item.get('data-curriculum')
     dict_workshop['country'] = item.get('data-country')
     dict_workshop['format'] = item.get('data-meeting')
@@ -283,7 +283,7 @@ workshop_list = []
 while child_div is not None:
     dict_workshop = {}
     dict_workshop['host'] = child_div.find('h3').get_text()
-    dict_workshop['link'] = child_div.find('a').get('href')
+    dict_workshop['link'] = child_div.find('h3').find('a').get('href')
     dict_workshop['curriculum'] = child_div.get('data-curriculum')
     dict_workshop['country'] = child_div.get('data-country')
     dict_workshop['format'] = child_div.get('data-meeting')
@@ -325,7 +325,7 @@ workshop_list = []
 for item in divs_past:
     dict_workshop = {}
     dict_workshop['host'] = item.find('h3').get_text()
-    dict_workshop['link'] = item.find('a').get('href')
+    dict_workshop['link'] = item.find('h3').find('a').get('href')
     dict_workshop['curriculum'] = item.get('data-curriculum')
     dict_workshop['country'] = item.get('data-country')
     dict_workshop['format'] = item.get('data-meeting')
@@ -362,7 +362,7 @@ We only need to add three lines to our loop, and this is how it would look like.
 for item in divs: 
     dict_workshop = {}
     dict_workshop['host'] = item.find('h3').get_text()
-    dict_workshop['link'] = item.find('a')['href']
+    dict_workshop['link'] = item.find('h3').find('a')['href']
     dict_workshop['curriculum'] = item.get('data-curriculum')
     dict_workshop['country'] = item.get('data-country')
     dict_workshop['format'] = item.get('data-meeting')