From a2b9e6f3d0938582fa9229e6db1621a39271b63e Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Mon, 26 Jan 2026 15:51:15 +0100 Subject: [PATCH 01/12] add populated_duration property --- src/osekit/core_api/base_data.py | 7 ++++++- src/osekit/core_api/spectro_data.py | 15 ++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/osekit/core_api/base_data.py b/src/osekit/core_api/base_data.py index 0406deab..a39895db 100644 --- a/src/osekit/core_api/base_data.py +++ b/src/osekit/core_api/base_data.py @@ -12,7 +12,7 @@ from typing import Self, TypeVar import numpy as np -from pandas import Timestamp, date_range +from pandas import Timedelta, Timestamp, date_range from osekit.config import ( DPDEFAULT, @@ -129,6 +129,11 @@ def end(self, value: Timestamp) -> None: for item in self.items: item.end = min(item.end, value) + @property + def populated_duration(self) -> Timedelta: + """Total duration of the non-empty parts of the data.""" + return Timedelta(sum(item.duration for item in self.items if not item.is_empty)) + def get_value(self) -> np.ndarray: """Get the concatenated values from all Items.""" return np.concatenate([item.get_value() for item in self.items]) diff --git a/src/osekit/core_api/spectro_data.py b/src/osekit/core_api/spectro_data.py index c9c32841..297867ac 100644 --- a/src/osekit/core_api/spectro_data.py +++ b/src/osekit/core_api/spectro_data.py @@ -31,7 +31,7 @@ if TYPE_CHECKING: from pathlib import Path - from pandas import Timestamp + from pandas import Timedelta, Timestamp from osekit.core_api.frequency_scale import Scale @@ -248,6 +248,19 @@ def v_lim(self, v_lim: tuple[float, float] | None) -> None: v_lim = (-120.0, 0.0) if self.db_type == "FS" else (0.0, 170.0) self._v_lim = v_lim + @property + def populated_duration(self) -> Timedelta: + """Override BaseData.populated_duration. + + If the SpectroData has no associated file, it will return the + populated duration of the associated AudioData. 
+ """ + if self.files: + return super().populated_duration + if not self.audio_data: + return Timedelta(0) + return self.audio_data.populated_duration + def get_value(self) -> np.ndarray: """Return the Sx matrix of the spectrogram. From 25dffa4de4d690b749579fabd9376e2af3ff7946 Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Mon, 26 Jan 2026 15:51:34 +0100 Subject: [PATCH 02/12] add populated_ratio property --- src/osekit/core_api/base_data.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/osekit/core_api/base_data.py b/src/osekit/core_api/base_data.py index a39895db..ea2f9667 100644 --- a/src/osekit/core_api/base_data.py +++ b/src/osekit/core_api/base_data.py @@ -134,6 +134,11 @@ def populated_duration(self) -> Timedelta: """Total duration of the non-empty parts of the data.""" return Timedelta(sum(item.duration for item in self.items if not item.is_empty)) + @property + def populated_ratio(self) -> float: + """Ratio of the non-empty duration over the total duration of the data.""" + return self.populated_duration / self.duration + def get_value(self) -> np.ndarray: """Get the concatenated values from all Items.""" return np.concatenate([item.get_value() for item in self.items]) From 95c4216feff74be3ec067b599796b61bb67952bd Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Mon, 26 Jan 2026 16:02:19 +0100 Subject: [PATCH 03/12] add sum function start Timedelta(0) --- src/osekit/core_api/base_data.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/osekit/core_api/base_data.py b/src/osekit/core_api/base_data.py index ea2f9667..5846c9a7 100644 --- a/src/osekit/core_api/base_data.py +++ b/src/osekit/core_api/base_data.py @@ -132,7 +132,12 @@ def end(self, value: Timestamp) -> None: @property def populated_duration(self) -> Timedelta: """Total duration of the non-empty parts of the data.""" - return Timedelta(sum(item.duration for item in self.items if not item.is_empty)) + return Timedelta( + sum( + (item.duration for item in self.items if not
item.is_empty), + start=Timedelta(0), + ), + ) @property def populated_ratio(self) -> float: From 8f834526660da2c372e5717241b37d9970f9c806 Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Mon, 26 Jan 2026 16:08:48 +0100 Subject: [PATCH 04/12] add populated duration and ratio tests --- tests/test_core_api_base.py | 107 ++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/tests/test_core_api_base.py b/tests/test_core_api_base.py index c394f292..b7e1d64e 100644 --- a/tests/test_core_api_base.py +++ b/tests/test_core_api_base.py @@ -2481,3 +2481,110 @@ def test_dummydataset_data_from_dict() -> None: )[0] == dd1 ) + + +@pytest.mark.parametrize( + ("files", "begin", "end", "expected_pop_duration", "expected_pop_ratio"), + [ + pytest.param( + [ + DummyFile( + path=Path("foo"), + begin=Timestamp("2009-02-24 00:00:00"), + ), + ], + Timestamp("2009-02-24 00:00:00"), + Timestamp("2009-02-24 00:00:01"), + Timedelta(seconds=1), + 1.0, + id="one-full-file", + ), + pytest.param( + [ + DummyFile( + path=Path("foo"), + begin=Timestamp("2009-02-24 00:00:00"), + ), + ], + Timestamp("2009-02-24 00:00:00.4"), + Timestamp("2009-02-24 00:00:00.6"), + Timedelta(seconds=0.2), + 1.0, + id="one-full-file-part", + ), + pytest.param( + [ + DummyFile( + path=Path("foo"), + begin=Timestamp("2009-02-24 00:00:00"), + ), + ], + Timestamp("2009-02-24 00:00:00.5"), + Timestamp("2009-02-24 00:00:01.5"), + Timedelta(seconds=0.5), + 0.5, + id="one-file-part-with-empty-item", + ), + pytest.param( + [ + DummyFile( + path=Path("foo"), + begin=Timestamp("2009-02-24 00:00:00"), + ), + DummyFile( + path=Path("bar"), + begin=Timestamp("2009-02-24 00:00:01"), + ), + ], + Timestamp("2009-02-24 00:00:00"), + Timestamp("2009-02-24 00:00:02"), + Timedelta(seconds=2), + 1, + id="two-full-consecutive-files", + ), + pytest.param( + [ + DummyFile( + path=Path("foo"), + begin=Timestamp("2009-02-24 00:00:00"), + ), + DummyFile( + path=Path("bar"), + begin=Timestamp("2009-02-24 00:00:02"), + 
), + ], + Timestamp("2009-02-24 00:00:00"), + Timestamp("2009-02-24 00:00:03"), + Timedelta(seconds=2), + 2 / 3, + id="two-full-files-with-empty-gap", + ), + pytest.param( + [ + DummyFile( + path=Path("foo"), + begin=Timestamp("2009-02-24 00:00:02"), + ), + DummyFile( + path=Path("bar"), + begin=Timestamp("2009-02-24 00:00:04"), + ), + ], + Timestamp("2009-02-24 00:00:00"), + Timestamp("2009-02-24 00:00:10"), + Timedelta(seconds=2), + 2 / 10, + id="empty-items-before-and-after-files", + ), + ], +) +def test_populated_duration_and_ratio( + files: list[DummyFile], + begin: Timestamp, + end: Timestamp, + expected_pop_duration: Timedelta, + expected_pop_ratio: float, +) -> None: + dummy_data = DummyData.from_files(files=files, begin=begin, end=end) + assert dummy_data.populated_duration == expected_pop_duration + assert np.isclose(dummy_data.populated_ratio, expected_pop_ratio) From 2f08f3af1bf4a9465515b78b3cdebe36e192e48a Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Mon, 26 Jan 2026 16:21:56 +0100 Subject: [PATCH 05/12] add SpectroData.populated_duration test --- tests/test_spectro.py | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tests/test_spectro.py b/tests/test_spectro.py index fc23e14c..4490ba7a 100644 --- a/tests/test_spectro.py +++ b/tests/test_spectro.py @@ -12,6 +12,7 @@ from pandas import Timedelta, Timestamp from scipy.signal import ShortTimeFFT from scipy.signal.windows import hamming +from test_core_api_base import DummyFile from osekit.config import ( TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED, @@ -20,6 +21,7 @@ from osekit.core_api.audio_data import AudioData from osekit.core_api.audio_dataset import AudioDataset from osekit.core_api.audio_file import AudioFile +from osekit.core_api.base_data import BaseData from osekit.core_api.event import Event from osekit.core_api.frequency_scale import Scale, ScalePart from osekit.core_api.instrument import Instrument @@ -1516,3 +1518,46 @@ def 
mocked_read_metadata(self: SpectroFile, *args: list, **kwargs: dict) -> None with pytest.raises(ValueError, match=r"Items don't have the same time resolution."): SpectroData([si1, si3]).get_value() + + +def test_spectro_populated_duration_ratio( + patch_audio_data: None, + monkeypatch: pytest.MonkeyPatch, +) -> None: + ad_sentinel = object() + bd_sentinel = object() + + monkeypatch.setattr( + AudioData, + "populated_duration", + property(lambda _: ad_sentinel), + ) + monkeypatch.setattr(BaseData, "populated_duration", property(lambda _: bd_sentinel)) + + sft = ShortTimeFFT(hamming(512), hop=128, fs=48_000) + + # SD with file(s) should return the file population + monkeypatch.setattr( + SpectroData, + "files", + property( + lambda _: [DummyFile(path="foo", begin=Timestamp("2009-02-24 00:00:00"))], + ), + ) + assert ( + SpectroData.from_audio_data( + data=AudioData(mocked_value=[0, 1, 2]), + fft=sft, + ).populated_duration + == bd_sentinel + ) + + # SD without files should return the audio data population + monkeypatch.setattr(SpectroData, "files", property(lambda _: None)) + assert ( + SpectroData.from_audio_data( + data=AudioData(mocked_value=[0, 1, 2]), + fft=sft, + ).populated_duration + == ad_sentinel + ) From 25e4495694c1661e4f50f7337b848756ceb55629 Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Mon, 26 Jan 2026 16:22:10 +0100 Subject: [PATCH 06/12] add SpectroData.populated_duration test --- tests/test_spectro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_spectro.py b/tests/test_spectro.py index 4490ba7a..53da8e10 100644 --- a/tests/test_spectro.py +++ b/tests/test_spectro.py @@ -1520,7 +1520,7 @@ def mocked_read_metadata(self: SpectroFile, *args: list, **kwargs: dict) -> None SpectroData([si1, si3]).get_value() -def test_spectro_populated_duration_ratio( +def test_spectro_populated_duration( patch_audio_data: None, monkeypatch: pytest.MonkeyPatch, ) -> None: From e1e62bcfcf1c53107fff227554ce4af179ae3fb8 Mon Sep 17 00:00:00 2001
From: Gautzilla Date: Mon, 26 Jan 2026 16:25:17 +0100 Subject: [PATCH 07/12] add SpectroData.populated_duration test case with no ad nor file --- src/osekit/core_api/spectro_data.py | 3 ++- tests/test_spectro.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/osekit/core_api/spectro_data.py b/src/osekit/core_api/spectro_data.py index 297867ac..627d3528 100644 --- a/src/osekit/core_api/spectro_data.py +++ b/src/osekit/core_api/spectro_data.py @@ -18,6 +18,7 @@ import numpy as np import pandas as pd from matplotlib.dates import date2num +from pandas import Timedelta from scipy.signal import ShortTimeFFT, welch from osekit.config import ( @@ -31,7 +32,7 @@ if TYPE_CHECKING: from pathlib import Path - from pandas import Timedelta, Timestamp + from pandas import Timestamp from osekit.core_api.frequency_scale import Scale diff --git a/tests/test_spectro.py b/tests/test_spectro.py index 53da8e10..1432b11c 100644 --- a/tests/test_spectro.py +++ b/tests/test_spectro.py @@ -1561,3 +1561,11 @@ def test_spectro_populated_duration( ).populated_duration == ad_sentinel ) + + # SD With no files or audio data return 0. 
+ sd = SpectroData.from_audio_data( + data=AudioData(mocked_value=[0, 1, 2]), + fft=sft, + ) + sd.audio_data = None + assert sd.populated_duration == Timedelta(0.0) From 1d7ce2bc6b7b0006051630d023a70c629d14a7e2 Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Mon, 26 Jan 2026 16:55:29 +0100 Subject: [PATCH 08/12] Move Dummy* core API test classes to new test helper module --- tests/helpers/__init__.py | 0 tests/helpers/dummy.py | 106 ++++++++++++++++++++++++++++++++ tests/test_core_api_base.py | 116 +++--------------------------------- tests/test_spectro.py | 2 +- 4 files changed, 115 insertions(+), 109 deletions(-) create mode 100644 tests/helpers/__init__.py create mode 100644 tests/helpers/dummy.py diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/helpers/dummy.py b/tests/helpers/dummy.py new file mode 100644 index 00000000..26c7e11b --- /dev/null +++ b/tests/helpers/dummy.py @@ -0,0 +1,106 @@ +import typing +from pathlib import Path +from typing import Self + +import numpy as np +from pandas import Timestamp + +from osekit.core_api.base_data import BaseData, TFile +from osekit.core_api.base_dataset import BaseDataset, TData +from osekit.core_api.base_file import BaseFile +from osekit.core_api.base_item import BaseItem + + +class DummyFile(BaseFile): + supported_extensions: typing.ClassVar = [""] + + def read(self, start: Timestamp, stop: Timestamp) -> np.ndarray: ... + + +class DummyItem(BaseItem[DummyFile]): ... + + +class DummyData(BaseData[DummyItem, DummyFile]): + item_cls = DummyItem + + def write(self, folder: Path, *, link: bool = False) -> None: ... + + def link(self, folder: Path) -> None: ... 
+ + def _make_split_data( + self, + files: list[DummyFile], + begin: Timestamp, + end: Timestamp, + **kwargs, # noqa: ANN003 + ) -> Self: + return DummyData.from_files(files=files, begin=begin, end=end, **kwargs) + + @classmethod + def _make_file(cls, path: Path, begin: Timestamp) -> DummyFile: + return DummyFile(path=path, begin=begin) + + @classmethod + def _make_item( + cls, + file: TFile | None = None, + begin: Timestamp | None = None, + end: Timestamp | None = None, + ) -> DummyItem: + return DummyItem(file=file, begin=begin, end=end) + + @classmethod + def _from_base_dict( + cls, + dictionary: dict, + files: list[TFile], + begin: Timestamp, + end: Timestamp, + **kwargs, # noqa: ANN003 + ) -> Self: + return cls.from_files( + files=files, + begin=begin, + end=end, + ) + + @classmethod + def from_files( + cls, + files: list[DummyFile], + begin: Timestamp | None = None, + end: Timestamp | None = None, + name: str | None = None, + **kwargs, # noqa: ANN003 + ) -> Self: + return super().from_files( + files=files, + begin=begin, + end=end, + name=name, + **kwargs, + ) + + +class DummyDataset(BaseDataset[DummyData, DummyFile]): + @classmethod + def _data_from_dict(cls, dictionary: dict) -> list[TData]: + return [DummyData.from_dict(data) for data in dictionary.values()] + + @classmethod + def _data_from_files( + cls, + files: list[DummyFile], + begin: Timestamp | None = None, + end: Timestamp | None = None, + name: str | None = None, + **kwargs, + ) -> TData: + return DummyData.from_files( + files=files, + begin=begin, + end=end, + name=name, + ) + + file_cls = DummyFile diff --git a/tests/test_core_api_base.py b/tests/test_core_api_base.py index b7e1d64e..7f6303bc 100644 --- a/tests/test_core_api_base.py +++ b/tests/test_core_api_base.py @@ -1,115 +1,15 @@ from __future__ import annotations -import typing from pathlib import Path -from typing import Literal, Self +from typing import Literal import numpy as np -import pandas as pd import pytest -from pandas import 
Timedelta, Timestamp +from pandas import Timedelta, Timestamp, date_range from osekit.config import TIMESTAMP_FORMATS_EXPORTED_FILES -from osekit.core_api.base_data import BaseData, TFile -from osekit.core_api.base_dataset import BaseDataset, TData -from osekit.core_api.base_file import BaseFile -from osekit.core_api.base_item import BaseItem from osekit.core_api.event import Event - - -class DummyFile(BaseFile): - supported_extensions: typing.ClassVar = [""] - - def read(self, start: Timestamp, stop: Timestamp) -> np.ndarray: ... - - -class DummyItem(BaseItem[DummyFile]): ... - - -class DummyData(BaseData[DummyItem, DummyFile]): - item_cls = DummyItem - - def write(self, folder: Path, *, link: bool = False) -> None: ... - - def link(self, folder: Path) -> None: ... - - def _make_split_data( - self, - files: list[DummyFile], - begin: Timestamp, - end: Timestamp, - **kwargs, # noqa: ANN003 - ) -> Self: - return DummyData.from_files(files=files, begin=begin, end=end, **kwargs) - - @classmethod - def _make_file(cls, path: Path, begin: Timestamp) -> DummyFile: - return DummyFile(path=path, begin=begin) - - @classmethod - def _make_item( - cls, - file: TFile | None = None, - begin: Timestamp | None = None, - end: Timestamp | None = None, - ) -> DummyItem: - return DummyItem(file=file, begin=begin, end=end) - - @classmethod - def _from_base_dict( - cls, - dictionary: dict, - files: list[TFile], - begin: Timestamp, - end: Timestamp, - **kwargs, # noqa: ANN003 - ) -> Self: - return cls.from_files( - files=files, - begin=begin, - end=end, - ) - - @classmethod - def from_files( - cls, - files: list[DummyFile], - begin: Timestamp | None = None, - end: Timestamp | None = None, - name: str | None = None, - **kwargs, # noqa: ANN003 - ) -> Self: - return super().from_files( - files=files, - begin=begin, - end=end, - name=name, - **kwargs, - ) - - -class DummyDataset(BaseDataset[DummyData, DummyFile]): - @classmethod - def _data_from_dict(cls, dictionary: dict) -> list[TData]: - 
return [DummyData.from_dict(data) for data in dictionary.values()] - - @classmethod - def _data_from_files( - cls, - files: list[DummyFile], - begin: Timestamp | None = None, - end: Timestamp | None = None, - name: str | None = None, - **kwargs, - ) -> TData: - return DummyData.from_files( - files=files, - begin=begin, - end=end, - name=name, - ) - - file_cls = DummyFile +from tests.helpers.dummy import DummyData, DummyDataset, DummyFile @pytest.fixture @@ -117,14 +17,14 @@ def dummy_dataset(tmp_path: Path) -> DummyDataset: files = [tmp_path / f"file_{i}.txt" for i in range(5)] for file in files: file.touch() - timestamps = pd.date_range( - start=pd.Timestamp("2000-01-01 00:00:00"), + timestamps = date_range( + start=Timestamp("2000-01-01 00:00:00"), freq="1s", periods=5, ) dfs = [ - DummyFile(path=file, begin=timestamp, end=timestamp + pd.Timedelta(seconds=1)) + DummyFile(path=file, begin=timestamp, end=timestamp + Timedelta(seconds=1)) for file, timestamp in zip(files, timestamps, strict=False) ] return DummyDataset.from_files(files=dfs, mode="files") @@ -1122,7 +1022,7 @@ def test_dataset_move( ], ) def test_base_dataset_file_mode( - tmp_path: pytest.fixture, + tmp_path: Path, files: list[DummyFile], mode: Literal["files", "timedelta_total"], data_duration: Timedelta | None, @@ -1291,7 +1191,7 @@ def test_base_dataset_file_mode( ], ) def test_base_data_boundaries( - monkeypatch: pytest.fixture, + monkeypatch: pytest.MonkeyPatch, files: list[DummyFile], begin: Timestamp, end: Timestamp, diff --git a/tests/test_spectro.py b/tests/test_spectro.py index 1432b11c..c221ba90 100644 --- a/tests/test_spectro.py +++ b/tests/test_spectro.py @@ -12,7 +12,6 @@ from pandas import Timedelta, Timestamp from scipy.signal import ShortTimeFFT from scipy.signal.windows import hamming -from test_core_api_base import DummyFile from osekit.config import ( TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED, @@ -32,6 +31,7 @@ from osekit.core_api.spectro_file import SpectroFile from 
osekit.core_api.spectro_item import SpectroItem from osekit.utils.audio_utils import Normalization, generate_sample_audio +from tests.helpers.dummy import DummyFile @pytest.mark.parametrize( From f148c78494690552bbf612daadb9c9bee3750746 Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Tue, 27 Jan 2026 10:35:19 +0100 Subject: [PATCH 09/12] add BaseDataset.remove_empty_data() method --- src/osekit/core_api/base_dataset.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/osekit/core_api/base_dataset.py b/src/osekit/core_api/base_dataset.py index 3008ee8e..720d3a7f 100644 --- a/src/osekit/core_api/base_dataset.py +++ b/src/osekit/core_api/base_dataset.py @@ -170,6 +170,22 @@ def data_duration(self) -> Timedelta: ] return max(set(data_durations), key=data_durations.count) + def remove_empty_data(self, threshold: float = 0.0) -> None: + """Remove data whose populated ratio does not exceed ``threshold``. + + Parameters + ---------- + threshold: float + Populated ratio (non-empty duration / total duration) at or + under which the data should be removed. + Must be in the ``[0.,1.]`` interval. + + """ + if not 0.0 <= threshold <= 1.0: + msg = f"Threshold should be between 0 and 1.
Got {threshold}" + raise ValueError(msg) + self.data = [data for data in self.data if data.populated_ratio > threshold] + def write( self, folder: Path, From c4870ce03b747d6a48967e222cb6ea6c35e0d6d5 Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Tue, 27 Jan 2026 14:42:20 +0100 Subject: [PATCH 10/12] add BaseDataset.remove_empty_data tests --- tests/test_core_api_base.py | 66 +++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tests/test_core_api_base.py b/tests/test_core_api_base.py index 7f6303bc..9d7e9919 100644 --- a/tests/test_core_api_base.py +++ b/tests/test_core_api_base.py @@ -2488,3 +2488,69 @@ def test_populated_duration_and_ratio( dummy_data = DummyData.from_files(files=files, begin=begin, end=end) assert dummy_data.populated_duration == expected_pop_duration assert np.isclose(dummy_data.populated_ratio, expected_pop_ratio) + + +@pytest.mark.parametrize( + ("data_populated_ratios", "threshold", "expected_kept_data_idx"), + [ + pytest.param( + [1.0], + 0.0, + [0], + id="one_kept_data", + ), + pytest.param( + [0.0], + 1.0, + [], + id="one_rejected_data", + ), + pytest.param( + [0.0, 1.0], + 0.5, + [1], + id="one_kept_one_rejected", + ), + pytest.param( + [1.0, 1.0, 0.8, 0.6, 0.7], + 0.65, + [0, 1, 2, 4], + id="all_kept_but_one", + ), + pytest.param( + [0.49, 0.5, 0.51], + 0.5, + [2], + id="threshold_is_exclusive", + ), + ], +) +def test_dataset_remove_empty_data( + monkeypatch: pytest.MonkeyPatch, + data_populated_ratios: list[float], + threshold: float, + expected_kept_data_idx: list[int], +) -> None: + monkeypatch.setattr( + DummyData, + "populated_ratio", + property(lambda d: d._populated_ratio), + ) + + data = [] + for ratio in data_populated_ratios: + d = DummyData.from_files( + [DummyFile(path=Path(r"bruit"), begin=Timestamp("2021-04-02 00:00:00"))], + ) + d._populated_ratio = ratio + data.append(d) + + expected_kept_data = [ + d for idx, d in enumerate(data) if idx in expected_kept_data_idx + ] + + ds = 
DummyDataset(data) + + ds.remove_empty_data(threshold=threshold) + + assert np.array_equal(ds.data, expected_kept_data) From 52f0e305c2f2cabb8f747e1a1642e2abbdf6c846 Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Tue, 27 Jan 2026 14:44:16 +0100 Subject: [PATCH 11/12] add BaseDataset.remove_empty_data() threshold error tests --- tests/test_core_api_base.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_core_api_base.py b/tests/test_core_api_base.py index 9d7e9919..155e240a 100644 --- a/tests/test_core_api_base.py +++ b/tests/test_core_api_base.py @@ -2554,3 +2554,13 @@ def test_dataset_remove_empty_data( ds.remove_empty_data(threshold=threshold) assert np.array_equal(ds.data, expected_kept_data) + + +def test_dataset_remove_empty_data_threshold_errors() -> None: + ds = DummyDataset(data=[]) + + with pytest.raises(ValueError, match=r"Threshold should be between 0 and 1."): + ds.remove_empty_data(threshold=-0.5) + + with pytest.raises(ValueError, match=r"Threshold should be between 0 and 1."): + ds.remove_empty_data(threshold=1.5) From 3fe889fc711e8c178d84f81834993a2ba93c6c04 Mon Sep 17 00:00:00 2001 From: Gautzilla Date: Tue, 27 Jan 2026 15:55:49 +0100 Subject: [PATCH 12/12] add remove_empty_data() examples in doc --- docs/source/coreapi_usage.rst | 2 +- .../example_multiple_spectrograms_core.ipynb | 2 +- ...example_multiple_spectrograms_public.ipynb | 50 ++----------------- ...xample_reshaping_multiple_files_core.ipynb | 4 +- ...mple_reshaping_multiple_files_public.ipynb | 2 +- docs/source/publicapi_usage.rst | 2 +- 6 files changed, 10 insertions(+), 52 deletions(-) diff --git a/docs/source/coreapi_usage.rst b/docs/source/coreapi_usage.rst index 7ca2c4b6..baa0a73d 100644 --- a/docs/source/coreapi_usage.rst +++ b/docs/source/coreapi_usage.rst @@ -294,7 +294,7 @@ field: .. code-block:: python # Filtering the ads data to remove data without audio (e.g. 
between files) - ads.data = [ad for ad in ads.data if not ad.is_empty] + ads.remove_empty_data(threshold=0.) # Resampling/Exporting only the first audio data ad = ads.data[0] diff --git a/docs/source/example_multiple_spectrograms_core.ipynb b/docs/source/example_multiple_spectrograms_core.ipynb index 25e28005..a53a1ad0 100644 --- a/docs/source/example_multiple_spectrograms_core.ipynb +++ b/docs/source/example_multiple_spectrograms_core.ipynb @@ -110,7 +110,7 @@ ")\n", "\n", "# Remove the empty data by using the default AudioDataset constructor:\n", - "audio_dataset = AudioDataset([ad for ad in audio_dataset.data if not ad.is_empty])" + "audio_dataset.remove_empty_data(threshold=0.0)" ] }, { diff --git a/docs/source/example_multiple_spectrograms_public.ipynb b/docs/source/example_multiple_spectrograms_public.ipynb index 47a3477d..29372f1a 100644 --- a/docs/source/example_multiple_spectrograms_public.ipynb +++ b/docs/source/example_multiple_spectrograms_public.ipynb @@ -2,13 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "dc7ebca70b3b5da", "metadata": { - "ExecuteTime": { - "end_time": "2025-11-13T11:25:15.629114Z", - "start_time": "2025-11-13T11:25:15.535616Z" - }, "tags": [ "remove-cell" ] @@ -50,50 +46,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "bb002105fc9632e8", "metadata": { - "ExecuteTime": { - "end_time": "2025-11-13T11:25:21.374824Z", - "start_time": "2025-11-13T11:25:18.067636Z" - }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\t2025-11-13 12:25:20,650\n", - "Building the dataset...\n", - "\n", - "\t2025-11-13 12:25:20,652\n", - "Analyzing original audio files...\n", - "\n", - "\t2025-11-13 12:25:20,680\n", - "Organizing dataset folder...\n", - "\n" - ] - }, - { - "ename": "PermissionError", - "evalue": "[WinError 32] Le processus ne peut pas accéder au fichier car ce fichier est utilisé par un autre processus: 
'_static\\\\sample_audio\\\\sample_220925_223530.wav' -> '_static\\\\sample_audio\\\\data\\\\audio\\\\original\\\\sample_220925_223530.wav'", - "output_type": "error", - "traceback": [ - "\u001B[31m---------------------------------------------------------------------------\u001B[39m", - "\u001B[31mPermissionError\u001B[39m Traceback (most recent call last)", - "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[2]\u001B[39m\u001B[32m, line 14\u001B[39m\n\u001B[32m 6\u001B[39m \u001B[38;5;28;01mfrom\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34;01mosekit\u001B[39;00m\u001B[34;01m.\u001B[39;00m\u001B[34;01mcore_api\u001B[39;00m\u001B[34;01m.\u001B[39;00m\u001B[34;01minstrument\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;28;01mimport\u001B[39;00m Instrument\n\u001B[32m 8\u001B[39m dataset = Dataset(\n\u001B[32m 9\u001B[39m folder=audio_folder,\n\u001B[32m 10\u001B[39m strptime_format=\u001B[33m\"\u001B[39m\u001B[33m%\u001B[39m\u001B[33my\u001B[39m\u001B[33m%\u001B[39m\u001B[33mm\u001B[39m\u001B[38;5;132;01m%d\u001B[39;00m\u001B[33m_\u001B[39m\u001B[33m%\u001B[39m\u001B[33mH\u001B[39m\u001B[33m%\u001B[39m\u001B[33mM\u001B[39m\u001B[33m%\u001B[39m\u001B[33mS\u001B[39m\u001B[33m\"\u001B[39m,\n\u001B[32m 11\u001B[39m instrument=Instrument(end_to_end_db=\u001B[32m150.0\u001B[39m),\n\u001B[32m 12\u001B[39m )\n\u001B[32m---> \u001B[39m\u001B[32m14\u001B[39m \u001B[43mdataset\u001B[49m\u001B[43m.\u001B[49m\u001B[43mbuild\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n", - "\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\public_api\\dataset.py:156\u001B[39m, in \u001B[36mDataset.build\u001B[39m\u001B[34m(self)\u001B[39m\n\u001B[32m 144\u001B[39m \u001B[38;5;28mself\u001B[39m.logger.info(\u001B[33m\"\u001B[39m\u001B[33mOrganizing dataset folder...\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 145\u001B[39m move_tree(\n\u001B[32m 146\u001B[39m source=\u001B[38;5;28mself\u001B[39m.folder,\n\u001B[32m 147\u001B[39m 
destination=\u001B[38;5;28mself\u001B[39m.folder / \u001B[33m\"\u001B[39m\u001B[33mother\u001B[39m\u001B[33m\"\u001B[39m,\n\u001B[32m (...)\u001B[39m\u001B[32m 154\u001B[39m | {\u001B[38;5;28mself\u001B[39m.folder / \u001B[33m\"\u001B[39m\u001B[33mlog\u001B[39m\u001B[33m\"\u001B[39m},\n\u001B[32m 155\u001B[39m )\n\u001B[32m--> \u001B[39m\u001B[32m156\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_sort_dataset\u001B[49m\u001B[43m(\u001B[49m\u001B[43mads\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 157\u001B[39m ads.write_json(ads.folder)\n\u001B[32m 158\u001B[39m \u001B[38;5;28mself\u001B[39m.write_json()\n", - "\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\public_api\\dataset.py:513\u001B[39m, in \u001B[36mDataset._sort_dataset\u001B[39m\u001B[34m(self, dataset)\u001B[39m\n\u001B[32m 511\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34m_sort_dataset\u001B[39m(\u001B[38;5;28mself\u001B[39m, dataset: \u001B[38;5;28mtype\u001B[39m[DatasetChild]) -> \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 512\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mtype\u001B[39m(dataset) \u001B[38;5;129;01mis\u001B[39;00m AudioDataset:\n\u001B[32m--> \u001B[39m\u001B[32m513\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_sort_audio_dataset\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdataset\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 514\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m\n\u001B[32m 515\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mtype\u001B[39m(dataset) \u001B[38;5;129;01mis\u001B[39;00m SpectroDataset | LTASDataset:\n", - "\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\public_api\\dataset.py:520\u001B[39m, in \u001B[36mDataset._sort_audio_dataset\u001B[39m\u001B[34m(self, dataset)\u001B[39m\n\u001B[32m 519\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m 
\u001B[39m\u001B[34m_sort_audio_dataset\u001B[39m(\u001B[38;5;28mself\u001B[39m, dataset: AudioDataset) -> \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m--> \u001B[39m\u001B[32m520\u001B[39m \u001B[43mdataset\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmove_files\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_get_audio_dataset_subpath\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdataset\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\n", - "\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\core_api\\base_dataset.py:152\u001B[39m, in \u001B[36mBaseDataset.move_files\u001B[39m\u001B[34m(self, folder)\u001B[39m\n\u001B[32m 143\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"Move the dataset files to the destination folder.\u001B[39;00m\n\u001B[32m 144\u001B[39m \n\u001B[32m 145\u001B[39m \u001B[33;03mParameters\u001B[39;00m\n\u001B[32m (...)\u001B[39m\u001B[32m 149\u001B[39m \n\u001B[32m 150\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 151\u001B[39m \u001B[38;5;28;01mfor\u001B[39;00m file \u001B[38;5;129;01min\u001B[39;00m tqdm(\u001B[38;5;28mself\u001B[39m.files, disable=os.environ.get(\u001B[33m\"\u001B[39m\u001B[33mDISABLE_TQDM\u001B[39m\u001B[33m\"\u001B[39m, \u001B[33m\"\u001B[39m\u001B[33m\"\u001B[39m)):\n\u001B[32m--> \u001B[39m\u001B[32m152\u001B[39m \u001B[43mfile\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmove\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfolder\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 153\u001B[39m \u001B[38;5;28mself\u001B[39m._folder = folder\n", - "\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\core_api\\audio_file.py:128\u001B[39m, in \u001B[36mAudioFile.move\u001B[39m\u001B[34m(self, folder)\u001B[39m\n\u001B[32m 119\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"Move the file to the target folder.\u001B[39;00m\n\u001B[32m 120\u001B[39m \n\u001B[32m 121\u001B[39m \u001B[33;03mParameters\u001B[39;00m\n\u001B[32m 
(...)\u001B[39m\u001B[32m 125\u001B[39m \n\u001B[32m 126\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 127\u001B[39m afm.close()\n\u001B[32m--> \u001B[39m\u001B[32m128\u001B[39m \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmove\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfolder\u001B[49m\u001B[43m)\u001B[49m\n", - "\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\core_api\\base_file.py:171\u001B[39m, in \u001B[36mBaseFile.move\u001B[39m\u001B[34m(self, folder)\u001B[39m\n\u001B[32m 169\u001B[39m destination_path = folder / \u001B[38;5;28mself\u001B[39m.path.name\n\u001B[32m 170\u001B[39m folder.mkdir(exist_ok=\u001B[38;5;28;01mTrue\u001B[39;00m, parents=\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[32m--> \u001B[39m\u001B[32m171\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mpath\u001B[49m\u001B[43m.\u001B[49m\u001B[43mrename\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdestination_path\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 172\u001B[39m \u001B[38;5;28mself\u001B[39m.path = destination_path\n", - "\u001B[36mFile \u001B[39m\u001B[32m~\\AppData\\Roaming\\uv\\python\\cpython-3.13.3-windows-x86_64-none\\Lib\\pathlib\\_local.py:767\u001B[39m, in \u001B[36mPath.rename\u001B[39m\u001B[34m(self, target)\u001B[39m\n\u001B[32m 757\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34mrename\u001B[39m(\u001B[38;5;28mself\u001B[39m, target):\n\u001B[32m 758\u001B[39m \u001B[38;5;250m \u001B[39m\u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 759\u001B[39m \u001B[33;03m Rename this path to the target path.\u001B[39;00m\n\u001B[32m 760\u001B[39m \n\u001B[32m (...)\u001B[39m\u001B[32m 765\u001B[39m \u001B[33;03m Returns the new Path instance pointing to the target path.\u001B[39;00m\n\u001B[32m 766\u001B[39m \u001B[33;03m \"\"\"\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m767\u001B[39m 
\u001B[43mos\u001B[49m\u001B[43m.\u001B[49m\u001B[43mrename\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtarget\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 768\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m.with_segments(target)\n", - "\u001B[31mPermissionError\u001B[39m: [WinError 32] Le processus ne peut pas accéder au fichier car ce fichier est utilisé par un autre processus: '_static\\\\sample_audio\\\\sample_220925_223530.wav' -> '_static\\\\sample_audio\\\\data\\\\audio\\\\original\\\\sample_220925_223530.wav'" - ] - } - ], + "outputs": [], "source": [ "from pathlib import Path\n", "\n", @@ -225,7 +183,7 @@ "audio_dataset = dataset.get_analysis_audiodataset(analysis=analysis)\n", "\n", "# Filter the returned AudioDataset\n", - "audio_dataset.data = [ad for ad in audio_dataset.data if not ad.is_empty]" + "audio_dataset.remove_empty_data(threshold=0.0)" ] }, { diff --git a/docs/source/example_reshaping_multiple_files_core.ipynb b/docs/source/example_reshaping_multiple_files_core.ipynb index 5d43c013..7d80b57d 100644 --- a/docs/source/example_reshaping_multiple_files_core.ipynb +++ b/docs/source/example_reshaping_multiple_files_core.ipynb @@ -105,8 +105,8 @@ " f\"{'Nb of Empty data:':<30}{str(len([ad for ad in audio_dataset.data if ad.is_empty])):>30}\\n\"\n", ")\n", "\n", - "# Remove the empty data by using the default AudioDataset constructor:\n", - "audio_dataset = AudioDataset([ad for ad in audio_dataset.data if not ad.is_empty])" + "# Remove the empty data:\n", + "audio_dataset.remove_empty_data(threshold=0.0)" ] }, { diff --git a/docs/source/example_reshaping_multiple_files_public.ipynb b/docs/source/example_reshaping_multiple_files_public.ipynb index caf14afd..4ac5b78f 100644 --- a/docs/source/example_reshaping_multiple_files_public.ipynb +++ b/docs/source/example_reshaping_multiple_files_public.ipynb @@ -150,7 +150,7 @@ "audio_dataset = 
dataset.get_analysis_audiodataset(analysis=analysis)\n", "\n", "# Filter the returned AudioDataset\n", - "audio_dataset.data = [ad for ad in audio_dataset.data if not ad.is_empty]" + "audio_dataset.remove_empty_data(threshold=0.0)" ] }, { diff --git a/docs/source/publicapi_usage.rst b/docs/source/publicapi_usage.rst index fe5b2dc3..85fa55df 100644 --- a/docs/source/publicapi_usage.rst +++ b/docs/source/publicapi_usage.rst @@ -186,7 +186,7 @@ The returned ``AudioDataset`` can be edited at will and passed as a parameter la ads = dataset.get_analysis_audiodataset(analysis=analysis) # Filtering out the AudioData that are not linked to any audio file: - ads.data = [ad for ad in ads.data if not ad.is_empty] + ads.remove_empty_data(threshold=0.) The returned ``SpectroDataset`` can be used e.g. to plot sample spectrograms prior to the analysis: