From 72619d8973980417f68f40443fce270ee2bae73a Mon Sep 17 00:00:00 2001 From: jairomelo Date: Wed, 21 Jan 2026 16:34:22 -0800 Subject: [PATCH 1/4] Enhance web scraping functionality with error handling and fix data extraction logic --- episodes/a-real-website.md | 48 +++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/episodes/a-real-website.md b/episodes/a-real-website.md index 069234f..59416b5 100644 --- a/episodes/a-real-website.md +++ b/episodes/a-real-website.md @@ -56,11 +56,17 @@ from tqdm import tqdm # Getting the HTML from our desired URL as a text string url = 'https://carpentries.org/workshops/upcoming-workshops/' -req = requests.get(url).text +req = requests.get(url) -# Cleaning and printing the string -cleaned_req = re.sub(r'\s*\n\s*', '', req).strip() -print(cleaned_req[0:1000]) +# Checking if the request was successful +if req.status_code == 200: + req = req.text + + # Cleaning and printing the string + cleaned_req = re.sub(r'\s*\n\s*', '', req).strip() + print(cleaned_req[0:1000]) +else: + print(f"Failed to retrieve the webpage. Status code: {req.status_code}") ``` ```output @@ -114,7 +120,7 @@ soup = BeautifulSoup(cleaned_req, 'html.parser') # Finding all third-level headers and doing a formatted print h3_by_tag = soup.find_all('h3') print("Number of h3 elements found: ", len(h3_by_tag)) -for n, h3 in enumerate(h3_by_tag): +for n, h3 in enumerate(h3_by_tag, start=1): print(f"Workshop #{n} - {h3.get_text()}") ``` @@ -159,18 +165,18 @@ print(div_firsth3.prettify()) Remember, the output shown here is probably different than yours, as the website is continuously updated. ```output -
+
- + - Software Carpentry + Library Carpentry
- + - Puerto Rico + United States
@@ -181,12 +187,12 @@ Remember, the output shown here is probably different than yours, as the website

- - University of Puerto Rico + + University of North Texas

- Software Carpentry (Shell, Git, R for Reproducible Scientific Analysis) + Library Carpentry (Intro to Data, Unix Shell, Git, and/or OpenRefine)
@@ -194,7 +200,7 @@ Remember, the output shown here is probably different than yours, as the website : - Humberto Ortiz-Zuazaga, Airined Montes Mercado + Sarah Lynn Fisher, Maristella Feustle, Whitney Johnson-Freeman
@@ -203,11 +209,11 @@ Remember, the output shown here is probably different than yours, as the website : - Isabel Rivera, Diana Buitrago Escobar, Yabdiel Ramos Valerio + Marcia McIntosh, Trey Clark
- Jun 04 - Jun 10 2025 + Jan 22 - Jan 22 2026
``` @@ -246,11 +252,11 @@ workshop_list = [] for item in divs: dict_workshop = {} dict_workshop['host'] = item.find('h3').get_text() - dict_workshop['link'] = div_firsth3.find('h3').find('a').get('href') - dict_workshop['curriculum'] = div_firsth3.get('data-curriculum') - dict_workshop['country'] = div_firsth3.get('data-country') - dict_workshop['format'] = div_firsth3.get('data-meeting') - dict_workshop['program'] = div_firsth3.get('data-program') + dict_workshop['link'] = item.find('h3').find('a').get('href') # get is used to access attribute values as a dictionary + dict_workshop['curriculum'] = item.get('data-curriculum') + dict_workshop['country'] = item.get('data-country') + dict_workshop['format'] = item.get('data-meeting') + dict_workshop['program'] = item.get('data-program') workshop_list.append(dict_workshop) # Transform list into a DataFrame From 5d4d63cb3e05d38674ab1f2abb773e476d7c6467 Mon Sep 17 00:00:00 2001 From: jairomelo Date: Thu, 22 Jan 2026 13:38:47 -0800 Subject: [PATCH 2/4] Fix link extraction logic in workshop dictionary --- episodes/a-real-website.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/episodes/a-real-website.md b/episodes/a-real-website.md index 59416b5..e2612ec 100644 --- a/episodes/a-real-website.md +++ b/episodes/a-real-website.md @@ -230,7 +230,7 @@ As shown in the previous episode, we can store all this information in a Python # Create an empty dictionary and fill it with the info we are interested in dict_workshop = {} dict_workshop['host'] = div_firsth3.find('h3').get_text() -dict_workshop['link'] = div_firsth3.find('h3').find('a').get('href') +dict_workshop['link'] = div_firsth3.find('a').get('href') dict_workshop['curriculum'] = div_firsth3.get('data-curriculum') dict_workshop['country'] = div_firsth3.get('data-country') dict_workshop['format'] = div_firsth3.get('data-meeting') From 5ce4f61bc802b13b342eb81fe07ea683e772f8e8 Mon Sep 17 00:00:00 2001 From: jairomelo Date: Thu, 22 Jan 2026 13:40:44 -0800 
Subject: [PATCH 3/4] Fix link extraction logic in workshop dictionary - it is not necessary to iterate over h3 again to get the link --- episodes/a-real-website.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/episodes/a-real-website.md b/episodes/a-real-website.md index e2612ec..88fd9d6 100644 --- a/episodes/a-real-website.md +++ b/episodes/a-real-website.md @@ -252,7 +252,7 @@ workshop_list = [] for item in divs: dict_workshop = {} dict_workshop['host'] = item.find('h3').get_text() - dict_workshop['link'] = item.find('h3').find('a').get('href') # get is used to access attribute values as a dictionary + dict_workshop['link'] = item.find('a').get('href') # get is used to access attribute values as a dictionary dict_workshop['curriculum'] = item.get('data-curriculum') dict_workshop['country'] = item.get('data-country') dict_workshop['format'] = item.get('data-meeting') @@ -283,7 +283,7 @@ workshop_list = [] while child_div is not None: dict_workshop = {} dict_workshop['host'] = child_div.find('h3').get_text() - dict_workshop['link'] = child_div.find('h3').find('a').get('href') + dict_workshop['link'] = child_div.find('a').get('href') dict_workshop['curriculum'] = child_div.get('data-curriculum') dict_workshop['country'] = child_div.get('data-country') dict_workshop['format'] = child_div.get('data-meeting') @@ -325,7 +325,7 @@ workshop_list = [] for item in divs_past: dict_workshop = {} dict_workshop['host'] = item.find('h3').get_text() - dict_workshop['link'] = item.find('h3').find('a').get('href') + dict_workshop['link'] = item.find('a').get('href') dict_workshop['curriculum'] = item.get('data-curriculum') dict_workshop['country'] = item.get('data-country') dict_workshop['format'] = item.get('data-meeting') @@ -362,7 +362,7 @@ We only need to add three lines to our loop, and this is how it would look like. 
for item in divs: dict_workshop = {} dict_workshop['host'] = item.find('h3').get_text() - dict_workshop['link'] = item.find('h3').find('a')['href'] + dict_workshop['link'] = item.find('a')['href'] dict_workshop['curriculum'] = item.get('data-curriculum') dict_workshop['country'] = item.get('data-country') dict_workshop['format'] = item.get('data-meeting') From e992940ecc60cfe62c8f72b2af0f2caca3e5089d Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Mon, 26 Jan 2026 11:48:43 -0800 Subject: [PATCH 4/4] Undo episode outputs and removal of find(h3) for workshop links --- episodes/a-real-website.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/episodes/a-real-website.md b/episodes/a-real-website.md index 88fd9d6..b224686 100644 --- a/episodes/a-real-website.md +++ b/episodes/a-real-website.md @@ -165,18 +165,18 @@ print(div_firsth3.prettify()) Remember, the output shown here is probably different than yours, as the website is continuously updated. ```output -
+
- + - Library Carpentry + Software Carpentry
- + - United States + Puerto Rico
@@ -187,12 +187,12 @@ Remember, the output shown here is probably different than yours, as the website

- - University of North Texas + + University of Puerto Rico

- Library Carpentry (Intro to Data, Unix Shell, Git, and/or OpenRefine) + Software Carpentry (Shell, Git, R for Reproducible Scientific Analysis)
@@ -200,7 +200,7 @@ Remember, the output shown here is probably different than yours, as the website : - Sarah Lynn Fisher, Maristella Feustle, Whitney Johnson-Freeman + Humberto Ortiz-Zuazaga, Airined Montes Mercado
@@ -209,11 +209,11 @@ Remember, the output shown here is probably different than yours, as the website : - Marcia McIntosh, Trey Clark + Isabel Rivera, Diana Buitrago Escobar, Yabdiel Ramos Valerio
- Jan 22 - Jan 22 2026 + Jun 04 - Jun 10 2025
``` @@ -230,7 +230,7 @@ As shown in the previous episode, we can store all this information in a Python # Create an empty dictionary and fill it with the info we are interested in dict_workshop = {} dict_workshop['host'] = div_firsth3.find('h3').get_text() -dict_workshop['link'] = div_firsth3.find('a').get('href') +dict_workshop['link'] = div_firsth3.find('h3').find('a').get('href') dict_workshop['curriculum'] = div_firsth3.get('data-curriculum') dict_workshop['country'] = div_firsth3.get('data-country') dict_workshop['format'] = div_firsth3.get('data-meeting') @@ -252,7 +252,7 @@ workshop_list = [] for item in divs: dict_workshop = {} dict_workshop['host'] = item.find('h3').get_text() - dict_workshop['link'] = item.find('a').get('href') # get is used to access attribute values as a dictionary + dict_workshop['link'] = item.find('h3').find('a').get('href') # get is used to access attribute values as a dictionary dict_workshop['curriculum'] = item.get('data-curriculum') dict_workshop['country'] = item.get('data-country') dict_workshop['format'] = item.get('data-meeting') @@ -283,7 +283,7 @@ workshop_list = [] while child_div is not None: dict_workshop = {} dict_workshop['host'] = child_div.find('h3').get_text() - dict_workshop['link'] = child_div.find('a').get('href') + dict_workshop['link'] = child_div.find('h3').find('a').get('href') dict_workshop['curriculum'] = child_div.get('data-curriculum') dict_workshop['country'] = child_div.get('data-country') dict_workshop['format'] = child_div.get('data-meeting') @@ -325,7 +325,7 @@ workshop_list = [] for item in divs_past: dict_workshop = {} dict_workshop['host'] = item.find('h3').get_text() - dict_workshop['link'] = item.find('a').get('href') + dict_workshop['link'] = item.find('h3').find('a').get('href') dict_workshop['curriculum'] = item.get('data-curriculum') dict_workshop['country'] = item.get('data-country') dict_workshop['format'] = item.get('data-meeting') @@ -362,7 +362,7 @@ We only need to add three lines to 
our loop, and this is how it would look like. for item in divs: dict_workshop = {} dict_workshop['host'] = item.find('h3').get_text() - dict_workshop['link'] = item.find('a')['href'] + dict_workshop['link'] = item.find('h3').find('a')['href'] dict_workshop['curriculum'] = item.get('data-curriculum') dict_workshop['country'] = item.get('data-country') dict_workshop['format'] = item.get('data-meeting')