From a2b9e6f3d0938582fa9229e6db1621a39271b63e Mon Sep 17 00:00:00 2001
From: Gautzilla <gauthier.berthomieu@gmail.com>
Date: Mon, 26 Jan 2026 15:51:15 +0100
Subject: [PATCH 01/12] add populated_duration property

---
 src/osekit/core_api/base_data.py    |  7 ++++++-
 src/osekit/core_api/spectro_data.py | 15 ++++++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/osekit/core_api/base_data.py b/src/osekit/core_api/base_data.py
index 0406deab..a39895db 100644
--- a/src/osekit/core_api/base_data.py
+++ b/src/osekit/core_api/base_data.py
@@ -12,7 +12,7 @@
 from typing import Self, TypeVar
 
 import numpy as np
-from pandas import Timestamp, date_range
+from pandas import Timedelta, Timestamp, date_range
 
 from osekit.config import (
     DPDEFAULT,
@@ -129,6 +129,11 @@ def end(self, value: Timestamp) -> None:
         for item in self.items:
             item.end = min(item.end, value)
 
+    @property
+    def populated_duration(self) -> Timedelta:
+        """Total duration of the non-empty parts of the data."""
+        return Timedelta(sum(item.duration for item in self.items if not item.is_empty))
+
     def get_value(self) -> np.ndarray:
         """Get the concatenated values from all Items."""
         return np.concatenate([item.get_value() for item in self.items])
diff --git a/src/osekit/core_api/spectro_data.py b/src/osekit/core_api/spectro_data.py
index c9c32841..297867ac 100644
--- a/src/osekit/core_api/spectro_data.py
+++ b/src/osekit/core_api/spectro_data.py
@@ -31,7 +31,7 @@
 if TYPE_CHECKING:
     from pathlib import Path
 
-    from pandas import Timestamp
+    from pandas import Timedelta, Timestamp
 
     from osekit.core_api.frequency_scale import Scale
 
@@ -248,6 +248,19 @@ def v_lim(self, v_lim: tuple[float, float] | None) -> None:
             v_lim = (-120.0, 0.0) if self.db_type == "FS" else (0.0, 170.0)
         self._v_lim = v_lim
 
+    @property
+    def populated_duration(self) -> Timedelta:
+        """Override BaseData.populated_duration.
+
+        If the SpectroData has no associated file, it will return the
+        populated duration of the associated AudioData.
+        """
+        if self.files:
+            return super().populated_duration
+        if not self.audio_data:
+            return Timedelta(0)
+        return self.audio_data.populated_duration
+
     def get_value(self) -> np.ndarray:
         """Return the Sx matrix of the spectrogram.
 

From 25dffa4de4d690b749579fabd9376e2af3ff7946 Mon Sep 17 00:00:00 2001
From: Gautzilla <gauthier.berthomieu@gmail.com>
Date: Mon, 26 Jan 2026 15:51:34 +0100
Subject: [PATCH 02/12] add populated_ratio property

---
 src/osekit/core_api/base_data.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/osekit/core_api/base_data.py b/src/osekit/core_api/base_data.py
index a39895db..ea2f9667 100644
--- a/src/osekit/core_api/base_data.py
+++ b/src/osekit/core_api/base_data.py
@@ -134,6 +134,11 @@ def populated_duration(self) -> Timedelta:
         """Total duration of the non-empty parts of the data."""
         return Timedelta(sum(item.duration for item in self.items if not item.is_empty))
 
+    @property
+    def populated_ratio(self) -> float:
+        """Ratio of the non-empty duration over the total duration of the data."""
+        return self.populated_duration / self.duration
+
     def get_value(self) -> np.ndarray:
         """Get the concatenated values from all Items."""
         return np.concatenate([item.get_value() for item in self.items])

From 95c4216feff74be3ec067b599796b61bb67952bd Mon Sep 17 00:00:00 2001
From: Gautzilla <gauthier.berthomieu@gmail.com>
Date: Mon, 26 Jan 2026 16:02:19 +0100
Subject: [PATCH 03/12] add sum function start Timedelta(0)

---
 src/osekit/core_api/base_data.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/osekit/core_api/base_data.py b/src/osekit/core_api/base_data.py
index ea2f9667..5846c9a7 100644
--- a/src/osekit/core_api/base_data.py
+++ b/src/osekit/core_api/base_data.py
@@ -132,7 +132,12 @@ def end(self, value: Timestamp) -> None:
     @property
     def populated_duration(self) -> Timedelta:
         """Total duration of the non-empty parts of the data."""
-        return Timedelta(sum(item.duration for item in self.items if not item.is_empty))
+        return Timedelta(
+            sum(
+                (item.duration for item in self.items if not item.is_empty),
+                start=Timedelta(0),
+            ),
+        )
 
     @property
     def populated_ratio(self) -> float:

From 8f834526660da2c372e5717241b37d9970f9c806 Mon Sep 17 00:00:00 2001
From: Gautzilla <gauthier.berthomieu@gmail.com>
Date: Mon, 26 Jan 2026 16:08:48 +0100
Subject: [PATCH 04/12] add populated duration and ratio tests

---
 tests/test_core_api_base.py | 107 ++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)

diff --git a/tests/test_core_api_base.py b/tests/test_core_api_base.py
index c394f292..b7e1d64e 100644
--- a/tests/test_core_api_base.py
+++ b/tests/test_core_api_base.py
@@ -2481,3 +2481,110 @@ def test_dummydataset_data_from_dict() -> None:
         )[0]
         == dd1
     )
+
+
+@pytest.mark.parametrize(
+    ("files", "begin", "end", "expected_pop_duration", "expected_pop_ratio"),
+    [
+        pytest.param(
+            [
+                DummyFile(
+                    path=Path("foo"),
+                    begin=Timestamp("2009-02-24 00:00:00"),
+                ),
+            ],
+            Timestamp("2009-02-24 00:00:00"),
+            Timestamp("2009-02-24 00:00:01"),
+            Timedelta(seconds=1),
+            1.0,
+            id="one-full-file",
+        ),
+        pytest.param(
+            [
+                DummyFile(
+                    path=Path("foo"),
+                    begin=Timestamp("2009-02-24 00:00:00"),
+                ),
+            ],
+            Timestamp("2009-02-24 00:00:00.4"),
+            Timestamp("2009-02-24 00:00:00.6"),
+            Timedelta(seconds=0.2),
+            1.0,
+            id="one-full-file-part",
+        ),
+        pytest.param(
+            [
+                DummyFile(
+                    path=Path("foo"),
+                    begin=Timestamp("2009-02-24 00:00:00"),
+                ),
+            ],
+            Timestamp("2009-02-24 00:00:00.5"),
+            Timestamp("2009-02-24 00:00:01.5"),
+            Timedelta(seconds=0.5),
+            0.5,
+            id="one-file-part-with-empty-item",
+        ),
+        pytest.param(
+            [
+                DummyFile(
+                    path=Path("foo"),
+                    begin=Timestamp("2009-02-24 00:00:00"),
+                ),
+                DummyFile(
+                    path=Path("bar"),
+                    begin=Timestamp("2009-02-24 00:00:01"),
+                ),
+            ],
+            Timestamp("2009-02-24 00:00:00"),
+            Timestamp("2009-02-24 00:00:02"),
+            Timedelta(seconds=2),
+            1,
+            id="two-full-consecutive-files",
+        ),
+        pytest.param(
+            [
+                DummyFile(
+                    path=Path("foo"),
+                    begin=Timestamp("2009-02-24 00:00:00"),
+                ),
+                DummyFile(
+                    path=Path("bar"),
+                    begin=Timestamp("2009-02-24 00:00:02"),
+                ),
+            ],
+            Timestamp("2009-02-24 00:00:00"),
+            Timestamp("2009-02-24 00:00:03"),
+            Timedelta(seconds=2),
+            2 / 3,
+            id="two-full-files-with-empty-gap",
+        ),
+        pytest.param(
+            [
+                DummyFile(
+                    path=Path("foo"),
+                    begin=Timestamp("2009-02-24 00:00:02"),
+                ),
+                DummyFile(
+                    path=Path("bar"),
+                    begin=Timestamp("2009-02-24 00:00:04"),
+                ),
+            ],
+            Timestamp("2009-02-24 00:00:00"),
+            Timestamp("2009-02-24 00:00:10"),
+            Timedelta(seconds=2),
+            2 / 10,
+            id="empty-items-before-and-after-files",
+        ),
+    ],
+)
+def test_populated_duration_and_ratio(
+    files: list[DummyFile],
+    begin: Timestamp,
+    end: Timestamp,
+    expected_pop_duration: Timedelta,
+    expected_pop_ratio: float,
+) -> None:
+    dummy_data = DummyData.from_files(files=files, begin=begin, end=end)
+    assert dummy_data.populated_duration == expected_pop_duration
+    assert np.isclose(dummy_data.populated_ratio, expected_pop_ratio)

From 2f08f3af1bf4a9465515b78b3cdebe36e192e48a Mon Sep 17 00:00:00 2001
From: Gautzilla <gauthier.berthomieu@gmail.com>
Date: Mon, 26 Jan 2026 16:21:56 +0100
Subject: [PATCH 05/12] add SpectroData.populated_duration test

---
 tests/test_spectro.py | 45 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/tests/test_spectro.py b/tests/test_spectro.py
index fc23e14c..4490ba7a 100644
--- a/tests/test_spectro.py
+++ b/tests/test_spectro.py
@@ -12,6 +12,7 @@
 from pandas import Timedelta, Timestamp
 from scipy.signal import ShortTimeFFT
 from scipy.signal.windows import hamming
+from test_core_api_base import DummyFile
 
 from osekit.config import (
     TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED,
@@ -20,6 +21,7 @@
 from osekit.core_api.audio_data import AudioData
 from osekit.core_api.audio_dataset import AudioDataset
 from osekit.core_api.audio_file import AudioFile
+from osekit.core_api.base_data import BaseData
 from osekit.core_api.event import Event
 from osekit.core_api.frequency_scale import Scale, ScalePart
 from osekit.core_api.instrument import Instrument
@@ -1516,3 +1518,46 @@ def mocked_read_metadata(self: SpectroFile, *args: list, **kwargs: dict) -> None
 
     with pytest.raises(ValueError, match=r"Items don't have the same time resolution."):
         SpectroData([si1, si3]).get_value()
+
+
+def test_spectro_populated_duration_ratio(
+    patch_audio_data: None,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    ad_sentinel = object()
+    bd_sentinel = object()
+
+    monkeypatch.setattr(
+        AudioData,
+        "populated_duration",
+        property(lambda _: ad_sentinel),
+    )
+    monkeypatch.setattr(BaseData, "populated_duration", property(lambda _: bd_sentinel))
+
+    sft = ShortTimeFFT(hamming(512), hop=128, fs=48_000)
+
+    # SD with file(s) should return the file population
+    monkeypatch.setattr(
+        SpectroData,
+        "files",
+        property(
+            lambda _: [DummyFile(path="foo", begin=Timestamp("2009-02-24 00:00:00"))],
+        ),
+    )
+    assert (
+        SpectroData.from_audio_data(
+            data=AudioData(mocked_value=[0, 1, 2]),
+            fft=sft,
+        ).populated_duration
+        == bd_sentinel
+    )
+
+    # SD without files should return the audio data population
+    monkeypatch.setattr(SpectroData, "files", property(lambda _: None))
+    assert (
+        SpectroData.from_audio_data(
+            data=AudioData(mocked_value=[0, 1, 2]),
+            fft=sft,
+        ).populated_duration
+        == ad_sentinel
+    )

From 25e4495694c1661e4f50f7337b848756ceb55629 Mon Sep 17 00:00:00 2001
From: Gautzilla <gauthier.berthomieu@gmail.com>
Date: Mon, 26 Jan 2026 16:22:10 +0100
Subject: [PATCH 06/12] add SpectroData.populated_duration test

---
 tests/test_spectro.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_spectro.py b/tests/test_spectro.py
index 4490ba7a..53da8e10 100644
--- a/tests/test_spectro.py
+++ b/tests/test_spectro.py
@@ -1520,7 +1520,7 @@ def mocked_read_metadata(self: SpectroFile, *args: list, **kwargs: dict) -> None
         SpectroData([si1, si3]).get_value()
 
 
-def test_spectro_populated_duration_ratio(
+def test_spectro_populated_duration(
     patch_audio_data: None,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:

From e1e62bcfcf1c53107fff227554ce4af179ae3fb8 Mon Sep 17 00:00:00 2001
From: Gautzilla <gauthier.berthomieu@gmail.com>
Date: Mon, 26 Jan 2026 16:25:17 +0100
Subject: [PATCH 07/12] add SpectroData.populated_duration test case with no ad
 nor file

---
 src/osekit/core_api/spectro_data.py | 3 ++-
 tests/test_spectro.py               | 8 ++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/osekit/core_api/spectro_data.py b/src/osekit/core_api/spectro_data.py
index 297867ac..627d3528 100644
--- a/src/osekit/core_api/spectro_data.py
+++ b/src/osekit/core_api/spectro_data.py
@@ -18,6 +18,7 @@
 import numpy as np
 import pandas as pd
 from matplotlib.dates import date2num
+from pandas import Timedelta
 from scipy.signal import ShortTimeFFT, welch
 
 from osekit.config import (
@@ -31,7 +32,7 @@
 if TYPE_CHECKING:
     from pathlib import Path
 
-    from pandas import Timedelta, Timestamp
+    from pandas import Timestamp
 
     from osekit.core_api.frequency_scale import Scale
 
diff --git a/tests/test_spectro.py b/tests/test_spectro.py
index 53da8e10..1432b11c 100644
--- a/tests/test_spectro.py
+++ b/tests/test_spectro.py
@@ -1561,3 +1561,11 @@ def test_spectro_populated_duration(
         ).populated_duration
         == ad_sentinel
     )
+
+    # SD with neither files nor audio data should return 0.
+    sd = SpectroData.from_audio_data(
+        data=AudioData(mocked_value=[0, 1, 2]),
+        fft=sft,
+    )
+    sd.audio_data = None
+    assert sd.populated_duration == Timedelta(0.0)

From 1d7ce2bc6b7b0006051630d023a70c629d14a7e2 Mon Sep 17 00:00:00 2001
From: Gautzilla <gauthier.berthomieu@gmail.com>
Date: Mon, 26 Jan 2026 16:55:29 +0100
Subject: [PATCH 08/12] Move Dummy* core API test classes to new test helper
 module

---
 tests/helpers/__init__.py   |   0
 tests/helpers/dummy.py      | 106 ++++++++++++++++++++++++++++++++
 tests/test_core_api_base.py | 116 +++---------------------------------
 tests/test_spectro.py       |   2 +-
 4 files changed, 115 insertions(+), 109 deletions(-)
 create mode 100644 tests/helpers/__init__.py
 create mode 100644 tests/helpers/dummy.py

diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/helpers/dummy.py b/tests/helpers/dummy.py
new file mode 100644
index 00000000..26c7e11b
--- /dev/null
+++ b/tests/helpers/dummy.py
@@ -0,0 +1,106 @@
+import typing
+from pathlib import Path
+from typing import Self
+
+import numpy as np
+from pandas import Timestamp
+
+from osekit.core_api.base_data import BaseData, TFile
+from osekit.core_api.base_dataset import BaseDataset, TData
+from osekit.core_api.base_file import BaseFile
+from osekit.core_api.base_item import BaseItem
+
+
+class DummyFile(BaseFile):
+    supported_extensions: typing.ClassVar = [""]
+
+    def read(self, start: Timestamp, stop: Timestamp) -> np.ndarray: ...
+
+
+class DummyItem(BaseItem[DummyFile]): ...
+
+
+class DummyData(BaseData[DummyItem, DummyFile]):
+    item_cls = DummyItem
+
+    def write(self, folder: Path, *, link: bool = False) -> None: ...
+
+    def link(self, folder: Path) -> None: ...
+
+    def _make_split_data(
+        self,
+        files: list[DummyFile],
+        begin: Timestamp,
+        end: Timestamp,
+        **kwargs,  # noqa: ANN003
+    ) -> Self:
+        return DummyData.from_files(files=files, begin=begin, end=end, **kwargs)
+
+    @classmethod
+    def _make_file(cls, path: Path, begin: Timestamp) -> DummyFile:
+        return DummyFile(path=path, begin=begin)
+
+    @classmethod
+    def _make_item(
+        cls,
+        file: TFile | None = None,
+        begin: Timestamp | None = None,
+        end: Timestamp | None = None,
+    ) -> DummyItem:
+        return DummyItem(file=file, begin=begin, end=end)
+
+    @classmethod
+    def _from_base_dict(
+        cls,
+        dictionary: dict,
+        files: list[TFile],
+        begin: Timestamp,
+        end: Timestamp,
+        **kwargs,  # noqa: ANN003
+    ) -> Self:
+        return cls.from_files(
+            files=files,
+            begin=begin,
+            end=end,
+        )
+
+    @classmethod
+    def from_files(
+        cls,
+        files: list[DummyFile],
+        begin: Timestamp | None = None,
+        end: Timestamp | None = None,
+        name: str | None = None,
+        **kwargs,  # noqa: ANN003
+    ) -> Self:
+        return super().from_files(
+            files=files,
+            begin=begin,
+            end=end,
+            name=name,
+            **kwargs,
+        )
+
+
+class DummyDataset(BaseDataset[DummyData, DummyFile]):
+    @classmethod
+    def _data_from_dict(cls, dictionary: dict) -> list[TData]:
+        return [DummyData.from_dict(data) for data in dictionary.values()]
+
+    @classmethod
+    def _data_from_files(
+        cls,
+        files: list[DummyFile],
+        begin: Timestamp | None = None,
+        end: Timestamp | None = None,
+        name: str | None = None,
+        **kwargs,
+    ) -> TData:
+        return DummyData.from_files(
+            files=files,
+            begin=begin,
+            end=end,
+            name=name,
+        )
+
+    file_cls = DummyFile
diff --git a/tests/test_core_api_base.py b/tests/test_core_api_base.py
index b7e1d64e..7f6303bc 100644
--- a/tests/test_core_api_base.py
+++ b/tests/test_core_api_base.py
@@ -1,115 +1,15 @@
 from __future__ import annotations
 
-import typing
 from pathlib import Path
-from typing import Literal, Self
+from typing import Literal
 
 import numpy as np
-import pandas as pd
 import pytest
-from pandas import Timedelta, Timestamp
+from pandas import Timedelta, Timestamp, date_range
 
 from osekit.config import TIMESTAMP_FORMATS_EXPORTED_FILES
-from osekit.core_api.base_data import BaseData, TFile
-from osekit.core_api.base_dataset import BaseDataset, TData
-from osekit.core_api.base_file import BaseFile
-from osekit.core_api.base_item import BaseItem
 from osekit.core_api.event import Event
-
-
-class DummyFile(BaseFile):
-    supported_extensions: typing.ClassVar = [""]
-
-    def read(self, start: Timestamp, stop: Timestamp) -> np.ndarray: ...
-
-
-class DummyItem(BaseItem[DummyFile]): ...
-
-
-class DummyData(BaseData[DummyItem, DummyFile]):
-    item_cls = DummyItem
-
-    def write(self, folder: Path, *, link: bool = False) -> None: ...
-
-    def link(self, folder: Path) -> None: ...
-
-    def _make_split_data(
-        self,
-        files: list[DummyFile],
-        begin: Timestamp,
-        end: Timestamp,
-        **kwargs,  # noqa: ANN003
-    ) -> Self:
-        return DummyData.from_files(files=files, begin=begin, end=end, **kwargs)
-
-    @classmethod
-    def _make_file(cls, path: Path, begin: Timestamp) -> DummyFile:
-        return DummyFile(path=path, begin=begin)
-
-    @classmethod
-    def _make_item(
-        cls,
-        file: TFile | None = None,
-        begin: Timestamp | None = None,
-        end: Timestamp | None = None,
-    ) -> DummyItem:
-        return DummyItem(file=file, begin=begin, end=end)
-
-    @classmethod
-    def _from_base_dict(
-        cls,
-        dictionary: dict,
-        files: list[TFile],
-        begin: Timestamp,
-        end: Timestamp,
-        **kwargs,  # noqa: ANN003
-    ) -> Self:
-        return cls.from_files(
-            files=files,
-            begin=begin,
-            end=end,
-        )
-
-    @classmethod
-    def from_files(
-        cls,
-        files: list[DummyFile],
-        begin: Timestamp | None = None,
-        end: Timestamp | None = None,
-        name: str | None = None,
-        **kwargs,  # noqa: ANN003
-    ) -> Self:
-        return super().from_files(
-            files=files,
-            begin=begin,
-            end=end,
-            name=name,
-            **kwargs,
-        )
-
-
-class DummyDataset(BaseDataset[DummyData, DummyFile]):
-    @classmethod
-    def _data_from_dict(cls, dictionary: dict) -> list[TData]:
-        return [DummyData.from_dict(data) for data in dictionary.values()]
-
-    @classmethod
-    def _data_from_files(
-        cls,
-        files: list[DummyFile],
-        begin: Timestamp | None = None,
-        end: Timestamp | None = None,
-        name: str | None = None,
-        **kwargs,
-    ) -> TData:
-        return DummyData.from_files(
-            files=files,
-            begin=begin,
-            end=end,
-            name=name,
-        )
-
-    file_cls = DummyFile
+from tests.helpers.dummy import DummyData, DummyDataset, DummyFile
 
 
 @pytest.fixture
@@ -117,14 +17,14 @@ def dummy_dataset(tmp_path: Path) -> DummyDataset:
     files = [tmp_path / f"file_{i}.txt" for i in range(5)]
     for file in files:
         file.touch()
-    timestamps = pd.date_range(
-        start=pd.Timestamp("2000-01-01 00:00:00"),
+    timestamps = date_range(
+        start=Timestamp("2000-01-01 00:00:00"),
         freq="1s",
         periods=5,
     )
 
     dfs = [
-        DummyFile(path=file, begin=timestamp, end=timestamp + pd.Timedelta(seconds=1))
+        DummyFile(path=file, begin=timestamp, end=timestamp + Timedelta(seconds=1))
         for file, timestamp in zip(files, timestamps, strict=False)
     ]
     return DummyDataset.from_files(files=dfs, mode="files")
@@ -1122,7 +1022,7 @@ def test_dataset_move(
     ],
 )
 def test_base_dataset_file_mode(
-    tmp_path: pytest.fixture,
+    tmp_path: Path,
     files: list[DummyFile],
     mode: Literal["files", "timedelta_total"],
     data_duration: Timedelta | None,
@@ -1291,7 +1191,7 @@ def test_base_dataset_file_mode(
     ],
 )
 def test_base_data_boundaries(
-    monkeypatch: pytest.fixture,
+    monkeypatch: pytest.MonkeyPatch,
     files: list[DummyFile],
     begin: Timestamp,
     end: Timestamp,
diff --git a/tests/test_spectro.py b/tests/test_spectro.py
index 1432b11c..c221ba90 100644
--- a/tests/test_spectro.py
+++ b/tests/test_spectro.py
@@ -12,7 +12,6 @@
 from pandas import Timedelta, Timestamp
 from scipy.signal import ShortTimeFFT
 from scipy.signal.windows import hamming
-from test_core_api_base import DummyFile
 
 from osekit.config import (
     TIMESTAMP_FORMAT_EXPORTED_FILES_UNLOCALIZED,
@@ -32,6 +31,7 @@
 from osekit.core_api.spectro_file import SpectroFile
 from osekit.core_api.spectro_item import SpectroItem
 from osekit.utils.audio_utils import Normalization, generate_sample_audio
+from tests.helpers.dummy import DummyFile
 
 
 @pytest.mark.parametrize(

From f148c78494690552bbf612daadb9c9bee3750746 Mon Sep 17 00:00:00 2001
From: Gautzilla <gauthier.berthomieu@gmail.com>
Date: Tue, 27 Jan 2026 10:35:19 +0100
Subject: [PATCH 09/12] add BaseDataset.remove_empty_data() method

---
 src/osekit/core_api/base_dataset.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/osekit/core_api/base_dataset.py b/src/osekit/core_api/base_dataset.py
index 3008ee8e..720d3a7f 100644
--- a/src/osekit/core_api/base_dataset.py
+++ b/src/osekit/core_api/base_dataset.py
@@ -170,6 +170,22 @@ def data_duration(self) -> Timedelta:
         ]
         return max(set(data_durations), key=data_durations.count)
 
+    def remove_empty_data(self, threshold: float = 0.0) -> None:
+        """Remove data whose populated ratio is not greater than ``threshold``.
+
+        Parameters
+        ----------
+        threshold: float
+            Populated ratio (non-empty duration over total duration) at or
+            below which the data is removed.
+            Must be in the ``[0.,1.]`` interval.
+
+        """
+        if not 0.0 <= threshold <= 1.0:
+            msg = f"Threshold should be between 0 and 1. Got {threshold}"
+            raise ValueError(msg)
+        self.data = [data for data in self.data if data.populated_ratio > threshold]
+
     def write(
         self,
         folder: Path,

From c4870ce03b747d6a48967e222cb6ea6c35e0d6d5 Mon Sep 17 00:00:00 2001
From: Gautzilla <gauthier.berthomieu@gmail.com>
Date: Tue, 27 Jan 2026 14:42:20 +0100
Subject: [PATCH 10/12] add BaseDataset.remove_empty_data tests

---
 tests/test_core_api_base.py | 66 +++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/tests/test_core_api_base.py b/tests/test_core_api_base.py
index 7f6303bc..9d7e9919 100644
--- a/tests/test_core_api_base.py
+++ b/tests/test_core_api_base.py
@@ -2488,3 +2488,69 @@ def test_populated_duration_and_ratio(
     dummy_data = DummyData.from_files(files=files, begin=begin, end=end)
     assert dummy_data.populated_duration == expected_pop_duration
     assert np.isclose(dummy_data.populated_ratio, expected_pop_ratio)
+
+
+@pytest.mark.parametrize(
+    ("data_populated_ratios", "threshold", "expected_kept_data_idx"),
+    [
+        pytest.param(
+            [1.0],
+            0.0,
+            [0],
+            id="one_kept_data",
+        ),
+        pytest.param(
+            [0.0],
+            1.0,
+            [],
+            id="one_rejected_data",
+        ),
+        pytest.param(
+            [0.0, 1.0],
+            0.5,
+            [1],
+            id="one_kept_one_rejected",
+        ),
+        pytest.param(
+            [1.0, 1.0, 0.8, 0.6, 0.7],
+            0.65,
+            [0, 1, 2, 4],
+            id="all_kept_but_one",
+        ),
+        pytest.param(
+            [0.49, 0.5, 0.51],
+            0.5,
+            [2],
+            id="threshold_is_exclusive",
+        ),
+    ],
+)
+def test_dataset_remove_empty_data(
+    monkeypatch: pytest.MonkeyPatch,
+    data_populated_ratios: list[float],
+    threshold: float,
+    expected_kept_data_idx: list[int],
+) -> None:
+    monkeypatch.setattr(
+        DummyData,
+        "populated_ratio",
+        property(lambda d: d._populated_ratio),
+    )
+
+    data = []
+    for ratio in data_populated_ratios:
+        d = DummyData.from_files(
+            [DummyFile(path=Path(r"bruit"), begin=Timestamp("2021-04-02 00:00:00"))],
+        )
+        d._populated_ratio = ratio
+        data.append(d)
+
+    expected_kept_data = [
+        d for idx, d in enumerate(data) if idx in expected_kept_data_idx
+    ]
+
+    ds = DummyDataset(data)
+
+    ds.remove_empty_data(threshold=threshold)
+
+    assert np.array_equal(ds.data, expected_kept_data)

From 52f0e305c2f2cabb8f747e1a1642e2abbdf6c846 Mon Sep 17 00:00:00 2001
From: Gautzilla <gauthier.berthomieu@gmail.com>
Date: Tue, 27 Jan 2026 14:44:16 +0100
Subject: [PATCH 11/12] add BaseDataset.remove_empty_data() threshold error
 tests

---
 tests/test_core_api_base.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/test_core_api_base.py b/tests/test_core_api_base.py
index 9d7e9919..155e240a 100644
--- a/tests/test_core_api_base.py
+++ b/tests/test_core_api_base.py
@@ -2554,3 +2554,13 @@ def test_dataset_remove_empty_data(
     ds.remove_empty_data(threshold=threshold)
 
     assert np.array_equal(ds.data, expected_kept_data)
+
+
+def test_dataset_remove_empty_data_threshold_errors() -> None:
+    ds = DummyDataset(data=[])
+
+    with pytest.raises(ValueError, match=r"Threshold should be between 0 and 1."):
+        ds.remove_empty_data(threshold=-0.5)
+
+    with pytest.raises(ValueError, match=r"Threshold should be between 0 and 1."):
+        ds.remove_empty_data(threshold=1.5)

From 3fe889fc711e8c178d84f81834993a2ba93c6c04 Mon Sep 17 00:00:00 2001
From: Gautzilla <gauthier.berthomieu@gmail.com>
Date: Tue, 27 Jan 2026 15:55:49 +0100
Subject: [PATCH 12/12] add remove_empty_data() examples in doc

---
 docs/source/coreapi_usage.rst                 |  2 +-
 .../example_multiple_spectrograms_core.ipynb  |  2 +-
 ...example_multiple_spectrograms_public.ipynb | 50 ++-----------------
 ...xample_reshaping_multiple_files_core.ipynb |  4 +-
 ...mple_reshaping_multiple_files_public.ipynb |  2 +-
 docs/source/publicapi_usage.rst               |  2 +-
 6 files changed, 10 insertions(+), 52 deletions(-)

diff --git a/docs/source/coreapi_usage.rst b/docs/source/coreapi_usage.rst
index 7ca2c4b6..baa0a73d 100644
--- a/docs/source/coreapi_usage.rst
+++ b/docs/source/coreapi_usage.rst
@@ -294,7 +294,7 @@ field:
 .. code-block:: python
 
     # Filtering the ads data to remove data without audio (e.g. between files)
-    ads.data = [ad for ad in ads.data if not ad.is_empty]
+    ads.remove_empty_data(threshold=0.)
 
     # Resampling/Exporting only the first audio data
     ad = ads.data[0]
diff --git a/docs/source/example_multiple_spectrograms_core.ipynb b/docs/source/example_multiple_spectrograms_core.ipynb
index 25e28005..a53a1ad0 100644
--- a/docs/source/example_multiple_spectrograms_core.ipynb
+++ b/docs/source/example_multiple_spectrograms_core.ipynb
@@ -110,7 +110,7 @@
     ")\n",
     "\n",
     "# Remove the empty data by using the default AudioDataset constructor:\n",
-    "audio_dataset = AudioDataset([ad for ad in audio_dataset.data if not ad.is_empty])"
+    "audio_dataset.remove_empty_data(threshold=0.0)"
    ]
   },
   {
diff --git a/docs/source/example_multiple_spectrograms_public.ipynb b/docs/source/example_multiple_spectrograms_public.ipynb
index 47a3477d..29372f1a 100644
--- a/docs/source/example_multiple_spectrograms_public.ipynb
+++ b/docs/source/example_multiple_spectrograms_public.ipynb
@@ -2,13 +2,9 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "dc7ebca70b3b5da",
    "metadata": {
-    "ExecuteTime": {
-     "end_time": "2025-11-13T11:25:15.629114Z",
-     "start_time": "2025-11-13T11:25:15.535616Z"
-    },
     "tags": [
      "remove-cell"
     ]
@@ -50,50 +46,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "bb002105fc9632e8",
    "metadata": {
-    "ExecuteTime": {
-     "end_time": "2025-11-13T11:25:21.374824Z",
-     "start_time": "2025-11-13T11:25:18.067636Z"
-    },
     "tags": []
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\t2025-11-13 12:25:20,650\n",
-      "Building the dataset...\n",
-      "\n",
-      "\t2025-11-13 12:25:20,652\n",
-      "Analyzing original audio files...\n",
-      "\n",
-      "\t2025-11-13 12:25:20,680\n",
-      "Organizing dataset folder...\n",
-      "\n"
-     ]
-    },
-    {
-     "ename": "PermissionError",
-     "evalue": "[WinError 32] Le processus ne peut pas accéder au fichier car ce fichier est utilisé par un autre processus: '_static\\\\sample_audio\\\\sample_220925_223530.wav' -> '_static\\\\sample_audio\\\\data\\\\audio\\\\original\\\\sample_220925_223530.wav'",
-     "output_type": "error",
-     "traceback": [
-      "\u001B[31m---------------------------------------------------------------------------\u001B[39m",
-      "\u001B[31mPermissionError\u001B[39m                           Traceback (most recent call last)",
-      "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[2]\u001B[39m\u001B[32m, line 14\u001B[39m\n\u001B[32m      6\u001B[39m \u001B[38;5;28;01mfrom\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34;01mosekit\u001B[39;00m\u001B[34;01m.\u001B[39;00m\u001B[34;01mcore_api\u001B[39;00m\u001B[34;01m.\u001B[39;00m\u001B[34;01minstrument\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;28;01mimport\u001B[39;00m Instrument\n\u001B[32m      8\u001B[39m dataset = Dataset(\n\u001B[32m      9\u001B[39m     folder=audio_folder,\n\u001B[32m     10\u001B[39m     strptime_format=\u001B[33m\"\u001B[39m\u001B[33m%\u001B[39m\u001B[33my\u001B[39m\u001B[33m%\u001B[39m\u001B[33mm\u001B[39m\u001B[38;5;132;01m%d\u001B[39;00m\u001B[33m_\u001B[39m\u001B[33m%\u001B[39m\u001B[33mH\u001B[39m\u001B[33m%\u001B[39m\u001B[33mM\u001B[39m\u001B[33m%\u001B[39m\u001B[33mS\u001B[39m\u001B[33m\"\u001B[39m,\n\u001B[32m     11\u001B[39m     instrument=Instrument(end_to_end_db=\u001B[32m150.0\u001B[39m),\n\u001B[32m     12\u001B[39m )\n\u001B[32m---> \u001B[39m\u001B[32m14\u001B[39m \u001B[43mdataset\u001B[49m\u001B[43m.\u001B[49m\u001B[43mbuild\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
-      "\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\public_api\\dataset.py:156\u001B[39m, in \u001B[36mDataset.build\u001B[39m\u001B[34m(self)\u001B[39m\n\u001B[32m    144\u001B[39m \u001B[38;5;28mself\u001B[39m.logger.info(\u001B[33m\"\u001B[39m\u001B[33mOrganizing dataset folder...\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m    145\u001B[39m move_tree(\n\u001B[32m    146\u001B[39m     source=\u001B[38;5;28mself\u001B[39m.folder,\n\u001B[32m    147\u001B[39m     destination=\u001B[38;5;28mself\u001B[39m.folder / \u001B[33m\"\u001B[39m\u001B[33mother\u001B[39m\u001B[33m\"\u001B[39m,\n\u001B[32m   (...)\u001B[39m\u001B[32m    154\u001B[39m     | {\u001B[38;5;28mself\u001B[39m.folder / \u001B[33m\"\u001B[39m\u001B[33mlog\u001B[39m\u001B[33m\"\u001B[39m},\n\u001B[32m    155\u001B[39m )\n\u001B[32m--> \u001B[39m\u001B[32m156\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_sort_dataset\u001B[49m\u001B[43m(\u001B[49m\u001B[43mads\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m    157\u001B[39m ads.write_json(ads.folder)\n\u001B[32m    158\u001B[39m \u001B[38;5;28mself\u001B[39m.write_json()\n",
-      "\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\public_api\\dataset.py:513\u001B[39m, in \u001B[36mDataset._sort_dataset\u001B[39m\u001B[34m(self, dataset)\u001B[39m\n\u001B[32m    511\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34m_sort_dataset\u001B[39m(\u001B[38;5;28mself\u001B[39m, dataset: \u001B[38;5;28mtype\u001B[39m[DatasetChild]) -> \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m    512\u001B[39m     \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mtype\u001B[39m(dataset) \u001B[38;5;129;01mis\u001B[39;00m AudioDataset:\n\u001B[32m--> \u001B[39m\u001B[32m513\u001B[39m         \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_sort_audio_dataset\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdataset\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m    514\u001B[39m         \u001B[38;5;28;01mreturn\u001B[39;00m\n\u001B[32m    515\u001B[39m     \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mtype\u001B[39m(dataset) \u001B[38;5;129;01mis\u001B[39;00m SpectroDataset | LTASDataset:\n",
-      "\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\public_api\\dataset.py:520\u001B[39m, in \u001B[36mDataset._sort_audio_dataset\u001B[39m\u001B[34m(self, dataset)\u001B[39m\n\u001B[32m    519\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34m_sort_audio_dataset\u001B[39m(\u001B[38;5;28mself\u001B[39m, dataset: AudioDataset) -> \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m--> \u001B[39m\u001B[32m520\u001B[39m     \u001B[43mdataset\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmove_files\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_get_audio_dataset_subpath\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdataset\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\n",
-      "\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\core_api\\base_dataset.py:152\u001B[39m, in \u001B[36mBaseDataset.move_files\u001B[39m\u001B[34m(self, folder)\u001B[39m\n\u001B[32m    143\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"Move the dataset files to the destination folder.\u001B[39;00m\n\u001B[32m    144\u001B[39m \n\u001B[32m    145\u001B[39m \u001B[33;03mParameters\u001B[39;00m\n\u001B[32m   (...)\u001B[39m\u001B[32m    149\u001B[39m \n\u001B[32m    150\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m    151\u001B[39m \u001B[38;5;28;01mfor\u001B[39;00m file \u001B[38;5;129;01min\u001B[39;00m tqdm(\u001B[38;5;28mself\u001B[39m.files, disable=os.environ.get(\u001B[33m\"\u001B[39m\u001B[33mDISABLE_TQDM\u001B[39m\u001B[33m\"\u001B[39m, \u001B[33m\"\u001B[39m\u001B[33m\"\u001B[39m)):\n\u001B[32m--> \u001B[39m\u001B[32m152\u001B[39m     \u001B[43mfile\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmove\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfolder\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m    153\u001B[39m \u001B[38;5;28mself\u001B[39m._folder = folder\n",
-      "\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\core_api\\audio_file.py:128\u001B[39m, in \u001B[36mAudioFile.move\u001B[39m\u001B[34m(self, folder)\u001B[39m\n\u001B[32m    119\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"Move the file to the target folder.\u001B[39;00m\n\u001B[32m    120\u001B[39m \n\u001B[32m    121\u001B[39m \u001B[33;03mParameters\u001B[39;00m\n\u001B[32m   (...)\u001B[39m\u001B[32m    125\u001B[39m \n\u001B[32m    126\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m    127\u001B[39m afm.close()\n\u001B[32m--> \u001B[39m\u001B[32m128\u001B[39m \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmove\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfolder\u001B[49m\u001B[43m)\u001B[49m\n",
-      "\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\core_api\\base_file.py:171\u001B[39m, in \u001B[36mBaseFile.move\u001B[39m\u001B[34m(self, folder)\u001B[39m\n\u001B[32m    169\u001B[39m destination_path = folder / \u001B[38;5;28mself\u001B[39m.path.name\n\u001B[32m    170\u001B[39m folder.mkdir(exist_ok=\u001B[38;5;28;01mTrue\u001B[39;00m, parents=\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[32m--> \u001B[39m\u001B[32m171\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mpath\u001B[49m\u001B[43m.\u001B[49m\u001B[43mrename\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdestination_path\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m    172\u001B[39m \u001B[38;5;28mself\u001B[39m.path = destination_path\n",
-      "\u001B[36mFile \u001B[39m\u001B[32m~\\AppData\\Roaming\\uv\\python\\cpython-3.13.3-windows-x86_64-none\\Lib\\pathlib\\_local.py:767\u001B[39m, in \u001B[36mPath.rename\u001B[39m\u001B[34m(self, target)\u001B[39m\n\u001B[32m    757\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34mrename\u001B[39m(\u001B[38;5;28mself\u001B[39m, target):\n\u001B[32m    758\u001B[39m \u001B[38;5;250m    \u001B[39m\u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m    759\u001B[39m \u001B[33;03m    Rename this path to the target path.\u001B[39;00m\n\u001B[32m    760\u001B[39m \n\u001B[32m   (...)\u001B[39m\u001B[32m    765\u001B[39m \u001B[33;03m    Returns the new Path instance pointing to the target path.\u001B[39;00m\n\u001B[32m    766\u001B[39m \u001B[33;03m    \"\"\"\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m767\u001B[39m     \u001B[43mos\u001B[49m\u001B[43m.\u001B[49m\u001B[43mrename\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtarget\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m    768\u001B[39m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m.with_segments(target)\n",
-      "\u001B[31mPermissionError\u001B[39m: [WinError 32] Le processus ne peut pas accéder au fichier car ce fichier est utilisé par un autre processus: '_static\\\\sample_audio\\\\sample_220925_223530.wav' -> '_static\\\\sample_audio\\\\data\\\\audio\\\\original\\\\sample_220925_223530.wav'"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from pathlib import Path\n",
     "\n",
@@ -225,7 +183,7 @@
     "audio_dataset = dataset.get_analysis_audiodataset(analysis=analysis)\n",
     "\n",
     "# Filter the returned AudioDataset\n",
-    "audio_dataset.data = [ad for ad in audio_dataset.data if not ad.is_empty]"
+    "audio_dataset.remove_empty_data(threshold=0.0)"
    ]
   },
   {
diff --git a/docs/source/example_reshaping_multiple_files_core.ipynb b/docs/source/example_reshaping_multiple_files_core.ipynb
index 5d43c013..7d80b57d 100644
--- a/docs/source/example_reshaping_multiple_files_core.ipynb
+++ b/docs/source/example_reshaping_multiple_files_core.ipynb
@@ -105,8 +105,8 @@
     "    f\"{'Nb of Empty data:':<30}{str(len([ad for ad in audio_dataset.data if ad.is_empty])):>30}\\n\"\n",
     ")\n",
     "\n",
-    "# Remove the empty data by using the default AudioDataset constructor:\n",
-    "audio_dataset = AudioDataset([ad for ad in audio_dataset.data if not ad.is_empty])"
+    "# Remove the empty data:\n",
+    "audio_dataset.remove_empty_data(threshold=0.0)"
    ]
   },
   {
diff --git a/docs/source/example_reshaping_multiple_files_public.ipynb b/docs/source/example_reshaping_multiple_files_public.ipynb
index caf14afd..4ac5b78f 100644
--- a/docs/source/example_reshaping_multiple_files_public.ipynb
+++ b/docs/source/example_reshaping_multiple_files_public.ipynb
@@ -150,7 +150,7 @@
     "audio_dataset = dataset.get_analysis_audiodataset(analysis=analysis)\n",
     "\n",
     "# Filter the returned AudioDataset\n",
-    "audio_dataset.data = [ad for ad in audio_dataset.data if not ad.is_empty]"
+    "audio_dataset.remove_empty_data(threshold=0.0)"
    ]
   },
   {
diff --git a/docs/source/publicapi_usage.rst b/docs/source/publicapi_usage.rst
index fe5b2dc3..85fa55df 100644
--- a/docs/source/publicapi_usage.rst
+++ b/docs/source/publicapi_usage.rst
@@ -186,7 +186,7 @@ The returned ``AudioDataset`` can be edited at will and passed as a parameter la
     ads = dataset.get_analysis_audiodataset(analysis=analysis)
 
     # Filtering out the AudioData that are not linked to any audio file:
-    ads.data = [ad for ad in ads.data if not ad.is_empty]
+    ads.remove_empty_data(threshold=0.0)
 
 The returned ``SpectroDataset`` can be used e.g. to plot sample spectrograms prior to the analysis: