From 6aa0d9f23e3d9e67aa866bdc1d61e3b555e9cfb2 Mon Sep 17 00:00:00 2001 From: psemdel Date: Thu, 18 May 2023 10:41:13 +0200 Subject: [PATCH 1/3] Create a distinction between old style race and new style race. More tests added, more code in parser to solve the youth issue --- first_cycling_api/parser.py | 8 +- first_cycling_api/race/endpoints.py | 27 ++++-- tests/test_race.py | 132 ++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 10 deletions(-) diff --git a/first_cycling_api/parser.py b/first_cycling_api/parser.py index 54ba3eb..a922d8f 100644 --- a/first_cycling_api/parser.py +++ b/first_cycling_api/parser.py @@ -75,8 +75,12 @@ def parse_table(table): out_df[col] = out_df[col].astype(str).str.replace('.', '', regex=False).astype(int) # Parse soup to add information hidden in tags/links - headers = [th.text for th in table.tr.find_all('th')] - trs = table.find_all('tr')[1:] + if len([th.text for th in table.tr.find_all('th')])==0: #bug with youth, as a is missing + trs = table.find_all('tr')[0:] + else: + trs = table.find_all('tr')[1:] + + headers = [th.text for th in table.thead.find_all('th')] if 'Race.1' in out_df: out_df = out_df.rename(columns={'Race': 'Race_Country', 'Race.1': 'Race'}) diff --git a/first_cycling_api/race/endpoints.py b/first_cycling_api/race/endpoints.py index ecdec79..c325c35 100644 --- a/first_cycling_api/race/endpoints.py +++ b/first_cycling_api/race/endpoints.py @@ -48,7 +48,10 @@ def _get_victory_table(self): victory_table = self.soup.find('table', {'class': 'tablesorter'}) self.table = parse_table(victory_table) - +class Standing(): + def __init__(self, results_table): + self.results_table=results_table + class RaceStageVictories(RaceEndpoint): """ Race stage victory table response. Extends RaceEndpoint. @@ -86,14 +89,22 @@ def _parse_soup(self): self._get_sidebar_information() def _get_results_table(self): - results_table = self.soup.find('table', {'class': 'sortTabell'}) + results_table = self.soup.find('table', {'class': 'sortTabell tablesorter'}) if not results_table: - results_table = self.soup.find('table', {'class': 'sortTabell2'}) - self.results_table = parse_table(results_table) - - # Load all classification standings after stage - divs = self.soup.find_all('div', {'class': "tab-content dummy"}) - self.standings = {div['id']: parse_table(div.table) for div in divs} + results_table = self.soup.find('table', {'class': 'sortTabell2 tablesorter'}) + + if results_table: #old race type + self.results_table = parse_table(results_table) + + # Load all classification standings after stage + divs = self.soup.find_all('div', {'class': "tab-content dummy"}) + self.standings = {div['id']: Standing(parse_table(div.table)) for div in divs} #may not work and require the use of l=classification num + + else: #new race type + divs = self.soup.find_all('div', {'class': "tab-content"}) #includes also tab-content results + self.standings= {div['id']: Standing(parse_table(div.table)) for div in divs} + + self.results_table = self.standings[divs[0]['id']].results_table #first appearing is the result def _get_sidebar_information(self): # TODO return diff --git a/tests/test_race.py b/tests/test_race.py index c368778..7c0b9f5 100644 --- a/tests/test_race.py +++ b/tests/test_race.py @@ -27,3 +27,135 @@ def test_2023_amstel(): results_2023 = amstel_2023.results() assert len(results_2023.results_table) == 175 assert results_2023.results_table['Rider'].iloc[0] == 'Pogacar Tadej' + +#LBL uses the new style, single day race +def test_2023_lbl_women(): + lbl = Race(9052) + lbl_2023 = lbl.edition(year=2023) + results_2023 = lbl_2023.results() + assert len(results_2023.results_table) == 140 + assert results_2023.results_table['Rider'].iloc[0] == 'Vollering Demi' + +#TdF uses the old style, stage race +def test_2022_TdF(): + tdf= Race(17) + tdf_2022 = tdf.edition(year=2022) + results_2022 = tdf_2022.results() + assert len(results_2022.results_table) == 176 + assert results_2022.results_table['Rider'].iloc[0] == 'Vingegaard Jonas' + assert results_2022.results_table['Time'].iloc[0] == "79:33:20" + assert results_2022.results_table['Pos'].iloc[0] == "01" + + r=tdf_2022.results(classification_num=1).results_table + assert len(r) == 176 + assert r['Rider'].iloc[0] == 'Vingegaard Jonas' + assert r['Time'].iloc[0] == "79:33:20" + + r=tdf_2022.results(classification_num=2).results_table + assert len(r) == 26 + assert r['Rider'].iloc[0] == "Pogacar Tadej" + assert r['Time'].iloc[0] == "79:36:03" + + r=tdf_2022.results(classification_num=3).results_table + assert len(r) == 119 + assert r['Rider'].iloc[0] == "van Aert Wout" + assert r['Points'].iloc[0] == 480 + + r=tdf_2022.results(classification_num=4).results_table + assert len(r) == 55 + assert r['Rider'].iloc[0] == 'Vingegaard Jonas' + assert r['Points'].iloc[0] == 72 + + assert len(tdf_2022.results(classification_num=8).results_table) == 22 + + results_st1 = tdf_2022.results(stage_num=1) + assert len(results_st1.results_table) == 176 + assert results_st1.results_table['Rider'].iloc[0] == 'Lampaert Yves' + +#Itzulia uses the new style, stage race +def test_2023_itzulia(): + race = Race(14244) + r_2023 = race.edition(year=2023) + + #general + results_2023 = r_2023.results() + assert len(results_2023.results_table) == 113 + assert results_2023.results_table['Rider'].iloc[0] == 'Reusser Marlen' + + ### Following tests work after this PR + assert 'gc' in results_2023.standings + assert 'point' in results_2023.standings + assert 'mountain' in results_2023.standings + assert 'youth' in results_2023.standings + + r=r_2023.results(classification_num=1).results_table + assert len(r) == 113 + assert r['Rider'].iloc[0] == 'Reusser Marlen' + assert r['Time'].iloc[0] == "09:57:24" + + ###Following tests don't work with this PR, more code is required + #r=r_2023.results(classification_num=2).results_table + #assert len(r) == 18 + #assert r['Rider'].iloc[0] == 'Wyllie Ella' + #assert r['Time'].iloc[0] == "10:04:05" + + #r=r_2023.results(classification_num=3).results_table + #assert len(r) == 24 + #assert r['Rider'].iloc[0] == 'Reusser Marlen' + #assert r['Points'].iloc[0] == 79 + + #r=r_2023.results(classification_num=4).results_table + #assert len(r) == 14 + #assert r['Rider'].iloc[0] == 'Vollering Demi' + #assert r['Points'].iloc[0] == 15 + + #stage + results_2023 = r_2023.results(stage_num=1) + assert len(results_2023.results_table) == 113 + assert results_2023.results_table['Rider'].iloc[0] == 'Vollering Demi' + + ### Following tests work after this PR + assert 'gc' in results_2023.standings + assert 'point' in results_2023.standings + assert 'mountain' in results_2023.standings + assert 'youth' in results_2023.standings + + ###Following tests don't work with this PR, more code is required + + #r=r_2023.results(stage_num=1,classification_num=1).results_table + #assert len(r) == 97 + #assert r['Rider'].iloc[0] == 'Vollering Demi' + #assert r['Time'].iloc[0] == "03:16:22" + + #r=r_2023.results(stage_num=1,classification_num=2).results_table + #assert len(r) == 23 + #assert r['Rider'].iloc[0] == 'Wyllie Ella' + #assert r['Time'].iloc[0] == "03:19:38" + + #r=r_2023.results(stage_num=1,classification_num=3).results_table + #assert len(r) == 19 + #assert r['Rider'].iloc[0] == 'Vollering Demi' + #assert r['Points'].iloc[0] == 25 + + #r=r_2023.results(stage_num=1,classification_num=4).results_table + #assert len(r) == 7 + #assert r['Rider'].iloc[0] == 'Vollering Demi' + #assert r['Points'].iloc[0] == 6 + + #assert len(r_2023.results(stage_num=1,classification_num=8).results_table) == 19 + +my_vcr.use_cassette() #Is it normal that it is no decorator??? +def test_2014_giro_rosa_prologue(): + giro_rosa_2014 = RaceEdition(race_id=9064, year=2014) + results = giro_rosa_2014.results(stage_num=0) + assert results.results_table['Rider'].iloc[0] == 'van Vleuten Annemiek' + +def test_giro_donne_2001(): + giro_rosa_2001 = RaceEdition(race_id=9064, year=2001, ) + results = giro_rosa_2001.results(stage_num=1) + assert len(results.results_table) == 10 + + ###Following tests don't work with this PR, more code is required + + #results = giro_rosa_2001.results(stage_num=1,classification_num=3) #not existing + #assert results==None \ No newline at end of file From a3a523c709965b4356edf04a0ba4f21c88065c4e Mon Sep 17 00:00:00 2001 From: psemdel Date: Sat, 17 Jun 2023 20:53:50 +0200 Subject: [PATCH 2/3] fix result_table --- tests/test_race.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_race.py b/tests/test_race.py index 8906555..6b4dcb0 100644 --- a/tests/test_race.py +++ b/tests/test_race.py @@ -176,6 +176,6 @@ def test_2023_basque(): assert len(results_2023.results_table) == 161 assert results_2023.results_table['Rider'].iloc[0] == 'Vingegaard Jonas' - assert len(results_2023.standings['youth']) == 26 - assert results_2023.standings['youth']['Rider'].iloc[0] == 'McNulty Brandon' + assert len(results_2023.standings['youth'].results_table) == 26 + assert results_2023.standings['youth'].results_table['Rider'].iloc[0] == 'McNulty Brandon' From 3f99fee15b78d82926ac66525678b206b2d49f75 Mon Sep 17 00:00:00 2001 From: psemdel Date: Thu, 22 Jun 2023 21:13:10 +0200 Subject: [PATCH 3/3] Combi, RaceEditionStartlist --- first_cycling_api/combi.py | 57 ++++++++++++++++++++ first_cycling_api/parser.py | 17 +++++- first_cycling_api/race/endpoints.py | 26 ++++++++- first_cycling_api/race/race.py | 4 +- tests/test_combi.py | 83 +++++++++++++++++++++++++++++ tests/test_race.py | 75 +++++++------------------- tests/test_rider.py | 3 ++ 7 files changed, 205 insertions(+), 60 deletions(-) create mode 100644 first_cycling_api/combi.py create mode 100644 tests/test_combi.py diff --git a/first_cycling_api/combi.py b/first_cycling_api/combi.py new file mode 100644 index 0000000..3348871 --- /dev/null +++ b/first_cycling_api/combi.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sun May 14 15:39:13 2023 + +@author: maxime +""" +from .race.race import RaceEdition + +def combi_results_startlist(race_id, year,**kwargs): + try: + r=RaceEdition(race_id=race_id,year=year) + t=r.results(**kwargs) + + if t is None or ("results_table" in t.__dir__() and t.results_table is None): + #case of race not completed yet + r=RaceEdition(race_id=race_id,year=year) + kwargs.update(stage_num=1) + t=r.results(**kwargs) + if t is None or ("results_table" in t.__dir__() and (t.results_table is None or not "Inv name" in t.results_table.columns)): + #fallback TTT + kwargs.update(stage_num=2) + t=r.results(**kwargs) + + if "results_table" in t.__dir__(): + results_table=t.results_table + else: + results_table=t + + print(results_table) + print("Inv name" in results_table.columns) + + start_list=r.startlist() + + """ Convert HTML table from bs4 to pandas DataFrame. Return None if no data. """ + # TODO for rider results, format dates nicely with hidden column we are throwing away + + if "Inv name" in results_table.columns: + for i in results_table.index: + try: + results_table.loc[i,"BIB"]=start_list.bib_df.loc[results_table.loc[i,"Inv name"]]["BIB"] + except: + print(results_table.loc[i,"Inv name"] + " not found in the start list") + results_table.loc[i,"BIB"]=0 + t.results_table=results_table + else: + print("No Inv name in results_table, the stage may be a TTT") + return None + + return t + except Exception as msg: + import sys + _, _, exc_tb = sys.exc_info() + print("line " + str(exc_tb.tb_lineno)) + print(msg) + + diff --git a/first_cycling_api/parser.py b/first_cycling_api/parser.py index 63d7424..fe57ea1 100644 --- a/first_cycling_api/parser.py +++ b/first_cycling_api/parser.py @@ -82,12 +82,27 @@ def parse_table(table): if 'Race.1' in out_df: out_df = out_df.rename(columns={'Race': 'Race_Country', 'Race.1': 'Race'}) headers.insert(headers.index('Race'), 'Race_Country') - + + for col in out_df.columns: #problems with \nRider\n + if "Rider" in col: + out_df = out_df.rename(columns={col: 'Rider'}) + break + for i, col in enumerate(headers): #problems with \nRider\n + if "Rider" in col: + headers[i]='Rider' + break + soup_df = pd.DataFrame([tr.find_all('td') for tr in trs], columns=headers) # Add information hidden in tags for col, series in soup_df.items(): if col in ('Rider', 'Winner', 'Second', 'Third'): + if col =="Rider": + out_df["Rider"]=out_df["Rider"].str.replace("[*]","",regex=False) + out_df["Rider"]=out_df["Rider"].str.replace("*","",regex=False) + out_df["Rider"]=out_df["Rider"].str.replace(" "," " ,regex=False) + out_df["Inv name"]=out_df["Rider"].str.lower() + out_df[col + '_ID'] = series.apply(lambda td: rider_link_to_id(td.a)) try: out_df[col + '_Country'] = series.apply(lambda td: img_to_country_code(td.img)) diff --git a/first_cycling_api/race/endpoints.py b/first_cycling_api/race/endpoints.py index cae976d..763add5 100644 --- a/first_cycling_api/race/endpoints.py +++ b/first_cycling_api/race/endpoints.py @@ -1,6 +1,6 @@ from ..endpoints import ParsedEndpoint from ..parser import parse_table - +import pandas as pd class RaceEndpoint(ParsedEndpoint): """ @@ -110,3 +110,27 @@ def _get_results_table(self): def _get_sidebar_information(self): # TODO return + +class RaceEditionStartlist(RaceEndpoint): + def _parse_soup(self): + super()._parse_soup() + self._get_results_table() + + def _get_results_table(self): + tables = self.soup.find_all('table', {'class': 'tablesorter'}) + + arr=[] + + for t in tables: + sub_df=pd.read_html(str(t), decimal=',')[0] + sub_df.columns=["BIB","Inv name"] + sub_df["Inv name"]=sub_df["Inv name"].str.lower() + sub_df["Inv name"]=sub_df["Inv name"].str.replace("[*]","",regex=False) + sub_df["Inv name"]=sub_df["Inv name"].str.replace(" *","",regex=False) + sub_df["Inv name"]=sub_df["Inv name"].str.replace("*","",regex=False) + sub_df["Inv name"]=sub_df["Inv name"].str.replace(" "," " ,regex=False) + + arr.append(sub_df) + + bib_df =pd.concat(arr) + self.bib_df = bib_df.set_index(bib_df["Inv name"]) \ No newline at end of file diff --git a/first_cycling_api/race/race.py b/first_cycling_api/race/race.py index d42d9bf..7ecb9ea 100644 --- a/first_cycling_api/race/race.py +++ b/first_cycling_api/race/race.py @@ -1,5 +1,5 @@ from ..objects import FirstCyclingObject -from .endpoints import RaceEndpoint, RaceVictoryTable, RaceStageVictories, RaceEditionResults +from .endpoints import RaceEndpoint, RaceVictoryTable, RaceStageVictories, RaceEditionResults, RaceEditionStartlist from ..api import fc from ..constants import Classification @@ -161,7 +161,7 @@ def startlist(self): ------- RaceEndpoint """ - return self._get_endpoint(k=8) + return self._get_endpoint(k=8,endpoint=RaceEditionStartlist) def startlist_extended(self): """ diff --git a/tests/test_combi.py b/tests/test_combi.py new file mode 100644 index 0000000..4ee034e --- /dev/null +++ b/tests/test_combi.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sun May 14 16:26:19 2023 + +@author: maxime +""" + +from first_cycling_api.combi import combi_results_startlist +import numpy as np + +def test_combi_2019_amstel(): + t = combi_results_startlist(9,2019) + + assert len(t.results_table) == 175 + assert t.results_table['Rider'].iloc[0] == 'van der Poel Mathieu' + assert t.results_table['BIB'].iloc[0] ==181 + +def test_2022_TdF(): + t = combi_results_startlist(17,2022) + + assert len(t.results_table) == 176 + assert t.results_table['Rider'].iloc[0] == 'Vingegaard Jonas' + assert t.results_table['BIB'].iloc[0] == 18 + + t = combi_results_startlist(17,2022,classification_num=1) + assert len(t.results_table) == 176 + assert t.results_table['Rider'].iloc[0] == 'Vingegaard Jonas' + assert t.results_table['BIB'].iloc[0] == 18 + + t = combi_results_startlist(17,2022,classification_num=2) + assert len(t.results_table) == 26 + assert t.results_table['Rider'].iloc[0] == "Pogacar Tadej" + assert t.results_table['Time'].iloc[0] == "79:36:03" + + + +def test_combi_2023_itzulia(): + t = combi_results_startlist(14244,2023,stage_num=1) + + assert len(t.results_table) == 113 + assert t.results_table['Rider'].iloc[0] == 'Vollering Demi' + assert t.results_table['BIB'].iloc[0] ==1 + + assert 'gc' in t.standings + assert 'point' in t.standings + assert 'mountain' in t.standings + assert 'youth' in t.standings + + t = combi_results_startlist(14244,2023,stage_num=1,classification_num=1) + assert t.results_table['Rider'].iloc[0] == 'Vollering Demi' + assert t.results_table['BIB'].iloc[0] ==1 + + t = combi_results_startlist(14244,2023,stage_num=1,classification_num=3) + assert t.results_table['Rider'].iloc[0] == 'Vollering Demi' + assert t.results_table['BIB'].iloc[0] ==1 + +def test_combi_2023_gracia(): + t = combi_results_startlist(9549,2023,stage_num=3) + + assert len(t.results_table) == 128 + assert t.results_table['Rider'].iloc[0] == 'Rissveds Jenny' + assert t.results_table['BIB'].iloc[0] ==73 + + assert 'gc' in t.standings + assert 'point' in t.standings + assert 'mountain' in t.standings + assert 'youth' in t.standings + + #t = combi_results_startlist(9549,2023,stage_num=3,classification_num=3) + #assert t.results_table['Rider'].iloc[0] == 'Wlodarczyk Dominika' + +def test_giro_donne_2001(): + t = combi_results_startlist(9064,2001,stage_num=1) + assert len(t.results_table) == 10 + + #t = combi_results_startlist(9064,2001,stage_num=1,classification_num=3) #not existing + #assert t==None + + + + + diff --git a/tests/test_race.py b/tests/test_race.py index 6b4dcb0..52c4d27 100644 --- a/tests/test_race.py +++ b/tests/test_race.py @@ -4,6 +4,8 @@ my_vcr = vcr.VCR(cassette_library_dir='tests/vcr_cassettes/race', path_transformer=vcr.VCR.ensure_suffix('.yaml')) +#Amstel uses the old style, single day race + @my_vcr.use_cassette() def test_2019_amstel(): amstel = Race(9) @@ -11,30 +13,14 @@ def test_2019_amstel(): results_2019 = amstel_2019.results() assert len(results_2019.results_table) == 175 assert results_2019.results_table['Rider'].iloc[0] == 'van der Poel Mathieu' - - -@my_vcr.use_cassette() -def test_2014_giro_rosa_prologue(): - giro_rosa_2014 = RaceEdition(race_id=9064, year=2014) - results = giro_rosa_2014.results(stage_num=0) - assert results.results_table['Rider'].iloc[0] == 'van Vleuten Annemiek' - - -@my_vcr.use_cassette() -def test_2023_amstel(): - amstel = Race(9) - amstel_2023 = amstel.edition(year=2023) - results_2023 = amstel_2023.results() - assert len(results_2023.results_table) == 175 - assert results_2023.results_table['Rider'].iloc[0] == 'Pogacar Tadej' - + #LBL uses the new style, single day race def test_2023_lbl_women(): lbl = Race(9052) lbl_2023 = lbl.edition(year=2023) results_2023 = lbl_2023.results() assert len(results_2023.results_table) == 140 - assert results_2023.results_table['Rider'].iloc[0] == 'Vollering Demi' + assert results_2023.results_table['Rider'].iloc[0] == 'Vollering Demi' #TdF uses the old style, stage race def test_2022_TdF(): @@ -77,51 +63,15 @@ def test_2023_itzulia(): race = Race(14244) r_2023 = race.edition(year=2023) - #general - results_2023 = r_2023.results() - assert len(results_2023.results_table) == 113 - assert results_2023.results_table['Rider'].iloc[0] == 'Reusser Marlen' - - ### Following tests work after this PR - assert 'gc' in results_2023.standings - assert 'point' in results_2023.standings - assert 'mountain' in results_2023.standings - assert 'youth' in results_2023.standings - - r=r_2023.results(classification_num=1).results_table - assert len(r) == 113 - assert r['Rider'].iloc[0] == 'Reusser Marlen' - assert r['Time'].iloc[0] == "09:57:24" - - ###Following tests don't work with this PR, more code is required - #r=r_2023.results(classification_num=2).results_table - #assert len(r) == 18 - #assert r['Rider'].iloc[0] == 'Wyllie Ella' - #assert r['Time'].iloc[0] == "10:04:05" - - #r=r_2023.results(classification_num=3).results_table - #assert len(r) == 24 - #assert r['Rider'].iloc[0] == 'Reusser Marlen' - #assert r['Points'].iloc[0] == 79 - - #r=r_2023.results(classification_num=4).results_table - #assert len(r) == 14 - #assert r['Rider'].iloc[0] == 'Vollering Demi' - #assert r['Points'].iloc[0] == 15 - - #stage results_2023 = r_2023.results(stage_num=1) assert len(results_2023.results_table) == 113 assert results_2023.results_table['Rider'].iloc[0] == 'Vollering Demi' - ### Following tests work after this PR assert 'gc' in results_2023.standings assert 'point' in results_2023.standings assert 'mountain' in results_2023.standings assert 'youth' in results_2023.standings - ###Following tests don't work with this PR, more code is required - #r=r_2023.results(stage_num=1,classification_num=1).results_table #assert len(r) == 97 #assert r['Rider'].iloc[0] == 'Vollering Demi' @@ -143,7 +93,21 @@ def test_2023_itzulia(): #assert r['Points'].iloc[0] == 6 #assert len(r_2023.results(stage_num=1,classification_num=8).results_table) == 19 - + +my_vcr.use_cassette() #Is it normal that it is no decorator??? +def test_2014_giro_rosa_prologue(): + giro_rosa_2014 = RaceEdition(race_id=9064, year=2014) + results = giro_rosa_2014.results(stage_num=0) + assert results.results_table['Rider'].iloc[0] == 'van Vleuten Annemiek' + +@my_vcr.use_cassette() +def test_2023_amstel(): + amstel = Race(9) + amstel_2023 = amstel.edition(year=2023) + results_2023 = amstel_2023.results() + assert len(results_2023.results_table) == 175 + assert results_2023.results_table['Rider'].iloc[0] == 'Pogacar Tadej' + def test_giro_donne_2001(): giro_rosa_2001 = RaceEdition(race_id=9064, year=2001, ) results = giro_rosa_2001.results(stage_num=1) @@ -167,7 +131,6 @@ def test_2022_basque(): assert len(results_2022_yc.results_table) == 11 assert results_2022_yc.results_table['Rider'].iloc[0] == 'Evenepoel Remco' - @my_vcr.use_cassette() def test_2023_basque(): basque = Race(6) diff --git a/tests/test_rider.py b/tests/test_rider.py index fef7f26..8d2e9bc 100644 --- a/tests/test_rider.py +++ b/tests/test_rider.py @@ -8,4 +8,7 @@ def test_roglic_2020_results(): roglic = Rider(18655) results_2020 = roglic.year_results(2020) + #assert results_2020.sidebar_details['nation'] == 'Slovenia' + #assert results_2020.sidebar_details['height'] == 1.77 assert results_2020.results_df['UCI'].max() == 850 +