From e75a5fdb957f447997bfeaac9b286a59ec80d46f Mon Sep 17 00:00:00 2001 From: Mauro Date: Sun, 16 Nov 2025 11:45:52 +0100 Subject: [PATCH 1/2] Add stats for CDF and boxplots --- fastplot/__init__.py | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/fastplot/__init__.py b/fastplot/__init__.py index 3f35708..752ff40 100644 --- a/fastplot/__init__.py +++ b/fastplot/__init__.py @@ -47,7 +47,7 @@ def plot(data, path, mode = 'line', linewidth = 1, boxplot_whis=[5,95], timeseries_format='%Y/%m/%d', bars_width=0.6, boxplot_numerousness = False, boxplot_numerousness_fontsize = 'x-small', boxplot_fliersize=0, boxplot_palette=sns.color_palette(), boxplot_empty=False, boxplot_numerousness_rotate=None, - callback = None, timeseries_stacked_right_legend_order=True, CDF_complementary=False, vlines=None, hlines=None, vlines_style={}, hlines_style={}): + callback = None, timeseries_stacked_right_legend_order=True, CDF_complementary=False, vlines=None, hlines=None, vlines_style={}, hlines_style={}, stats=None): # 1. Create and configure plot visual style plt.rcParams.update(plt.rcParamsDefault) @@ -121,6 +121,11 @@ def plot(data, path, mode = 'line', if ylim is None: ylim = (0,1) + if isinstance(stats, dict): + clear_dict(stats) + for k, v in get_distribution_stats(data).items(): + stats[k] = v + elif mode == 'CDF_multi': for s_name, s in data : e = ECDF(s) @@ -149,6 +154,13 @@ def plot(data, path, mode = 'line', if ylim is None: ylim = (0,1) + if isinstance(stats, dict): + clear_dict(stats) + for name, samples in data: + stats[name] = {} + for k, v in get_distribution_stats(samples).items(): + stats[name][k] = v + elif mode == 'boxplot': labels = [e[0] for e in data] samples = [e[1] for e in data] @@ -177,6 +189,13 @@ def plot(data, path, mode = 'line', plt.setp(plt.gca().patches, edgecolor = 'black', facecolor='white', linewidth =1) plt.setp(plt.gca().lines, color='black', linewidth =1) + if isinstance(stats, dict): + clear_dict(stats) + for name, samples in data: + stats[name] = {} + for k, v in get_distribution_stats(samples).items(): + stats[name][k] = v + elif mode == 'boxplot_multi': new_data = [] for c in data: @@ -189,6 +208,13 @@ def plot(data, path, mode = 'line', p.legend().remove() plt.xlabel("") plt.gca().set_xticklabels(data.index) + + if isinstance(stats, dict): + clear_dict(stats) + for c in data: + stats[c] = {} + for index, values in data[c].items(): + stats[c][index] = get_distribution_stats(values) elif mode == 'timeseries': plt.plot(data, markeredgewidth=0, linewidth = linewidth, **plot_args) @@ -340,6 +366,22 @@ def tex_escape(text): regex = re.compile('|'.join(re.escape(str(key)) for key in sorted(conv.keys(), key = lambda item: - len(item)))) return regex.sub(lambda match: conv[match.group()], text) + +def get_distribution_stats(data): + stats = {} + stats['min'] = np.min(data) + stats['q1'] = np.percentile(data, 25) + stats['median'] = np.median(data) + stats['q3'] = np.percentile(data, 75) + stats['max'] = np.max(data) + stats['mean'] = np.mean(data) + stats['std'] = np.std(data) + return stats + +def clear_dict(d): + for k in list(d.keys()): + del d[k] + def gini(arr): count = arr.size coefficient = 2 / count From 8b1a214760e6c2713b2091f9c07bdf375a17351b Mon Sep 17 00:00:00 2001 From: Mauro Date: Mon, 17 Nov 2025 11:01:17 +0100 Subject: [PATCH 2/2] Add matplotlib ecdf --- fastplot/__init__.py | 83 +++++++++++++++++++++++++------------------- requirements.txt | 1 + setup.py | 2 +- 3 files changed, 50 insertions(+), 36 deletions(-) diff --git a/fastplot/__init__.py b/fastplot/__init__.py index 752ff40..9e36563 100644 --- a/fastplot/__init__.py +++ b/fastplot/__init__.py @@ -10,8 +10,12 @@ import numpy as np import pandas as pd import re +from packaging import version from statsmodels.distributions.empirical_distribution import ECDF +MPL_VERSION = mpl.__version__ +HAS_ECDF_PLOT = version.parse(MPL_VERSION) >= version.parse('3.8.0') # 3.8.0 was released in September 2023 + # Register Pandas Converters from pandas.plotting import register_matplotlib_converters register_matplotlib_converters() @@ -96,38 +100,12 @@ def plot(data, path, mode = 'line', plt.plot(data[0], data[1], markeredgewidth=0, linewidth = linewidth, **plot_args) elif mode == 'CDF': - s = data - e = ECDF(s) - if xscale == 'log': - x = np.logspace(np.log10(min(s)), np.log10(max(s)), NUM_BIN_CDF ) - if CDF_complementary: - y = 1-e(x) - else: - y = e(x) + if HAS_ECDF_PLOT: + # Use plt.ecdf for matplotlib 3.8+ + plt.ecdf(data, complementary=CDF_complementary, linewidth=linewidth, **plot_args) else: - x = np.linspace(min(s), max(s), NUM_BIN_CDF ) - if CDF_complementary: - y = 1-e(x) - x = np.concatenate( (np.array([min(s)]), x) ) - y = np.concatenate( (np.array([1]), y) ) - else: - y = e(x) - x = np.concatenate( (np.array([min(s)]), x) ) - y = np.concatenate( (np.array([0]), y) ) - - plt.plot(x,y, linewidth = linewidth, **plot_args) - if ylabel is None: - ylabel = 'CCDF' if CDF_complementary else "CDF" - if ylim is None: - ylim = (0,1) - - if isinstance(stats, dict): - clear_dict(stats) - for k, v in get_distribution_stats(data).items(): - stats[k] = v - - elif mode == 'CDF_multi': - for s_name, s in data : + # Fallback to statsmodels for older matplotlib versions + s = data e = ECDF(s) if xscale == 'log': x = np.logspace(np.log10(min(s)), np.log10(max(s)), NUM_BIN_CDF ) @@ -137,7 +115,6 @@ def plot(data, path, mode = 'line', y = e(x) else: x = np.linspace(min(s), max(s), NUM_BIN_CDF ) - if CDF_complementary: y = 1-e(x) x = np.concatenate( (np.array([min(s)]), x) ) @@ -147,7 +124,44 @@ def plot(data, path, mode = 'line', x = np.concatenate( (np.array([min(s)]), x) ) y = np.concatenate( (np.array([0]), y) ) - plt.plot(x,y, label=s_name, linewidth = linewidth, **plot_args) + plt.plot(x,y, linewidth = linewidth, **plot_args) + if ylabel is None: + ylabel = 'CCDF' if CDF_complementary else "CDF" + if ylim is None: + ylim = (0,1) + + if isinstance(stats, dict): + clear_dict(stats) + for k, v in get_distribution_stats(data).items(): + stats[k] = v + + elif mode == 'CDF_multi': + for s_name, s in data: + if HAS_ECDF_PLOT: + # Use plt.ecdf for matplotlib 3.8+ + plt.ecdf(s, label=s_name, complementary=CDF_complementary, linewidth=linewidth, **plot_args) + else: + # Fallback for older matplotlib versions + e = ECDF(s) + if xscale == 'log': + x = np.logspace(np.log10(min(s)), np.log10(max(s)), NUM_BIN_CDF ) + if CDF_complementary: + y = 1-e(x) + else: + y = e(x) + else: + x = np.linspace(min(s), max(s), NUM_BIN_CDF ) + + if CDF_complementary: + y = 1-e(x) + x = np.concatenate( (np.array([min(s)]), x) ) + y = np.concatenate( (np.array([1]), y) ) + else: + y = e(x) + x = np.concatenate( (np.array([min(s)]), x) ) + y = np.concatenate( (np.array([0]), y) ) + + plt.plot(x,y, label=s_name, linewidth = linewidth, **plot_args) if ylabel is None: ylabel = 'CCDF' if CDF_complementary else "CDF" @@ -407,5 +421,4 @@ def lorenz_gini_multi(data, name_format="{} (GI={:0.2f})"): name_new = name_format.format(name, gini_index) data_new.append( (name_new, (lorenz_x,lorenz_y) ) ) return data_new - - + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 32296f5..68131fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ pandas statsmodels scipy seaborn +packaging diff --git a/setup.py b/setup.py index d0c077a..5fb3a6b 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ url="https://github.com/marty90/fastplot", download_url = 'https://github.com/marty90/fastplot/tarball/1.5.0', packages=['fastplot'], - install_requires=['matplotlib', 'numpy', 'pandas', 'statsmodels', 'scipy', 'seaborn'] + install_requires=['matplotlib', 'numpy', 'pandas', 'statsmodels', 'scipy', 'seaborn', 'packaging'] ) # Upload on pip with: