Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/coreapi_usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ field:
.. code-block:: python

# Filtering the ads data to remove data without audio (e.g. between files)
ads.data = [ad for ad in ads.data if not ad.is_empty]
ads.remove_empty_data(threshold=0.)

# Resampling/Exporting only the first audio data
ad = ads.data[0]
Expand Down
2 changes: 1 addition & 1 deletion docs/source/example_multiple_spectrograms_core.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@
")\n",
"\n",
"# Remove the empty data:\n",
"audio_dataset = AudioDataset([ad for ad in audio_dataset.data if not ad.is_empty])"
"audio_dataset.remove_empty_data(threshold=0.0)"
]
},
{
Expand Down
50 changes: 4 additions & 46 deletions docs/source/example_multiple_spectrograms_public.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,9 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "dc7ebca70b3b5da",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-13T11:25:15.629114Z",
"start_time": "2025-11-13T11:25:15.535616Z"
},
"tags": [
"remove-cell"
]
Expand Down Expand Up @@ -50,50 +46,12 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "bb002105fc9632e8",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-13T11:25:21.374824Z",
"start_time": "2025-11-13T11:25:18.067636Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\t2025-11-13 12:25:20,650\n",
"Building the dataset...\n",
"\n",
"\t2025-11-13 12:25:20,652\n",
"Analyzing original audio files...\n",
"\n",
"\t2025-11-13 12:25:20,680\n",
"Organizing dataset folder...\n",
"\n"
]
},
{
"ename": "PermissionError",
"evalue": "[WinError 32] Le processus ne peut pas accéder au fichier car ce fichier est utilisé par un autre processus: '_static\\\\sample_audio\\\\sample_220925_223530.wav' -> '_static\\\\sample_audio\\\\data\\\\audio\\\\original\\\\sample_220925_223530.wav'",
"output_type": "error",
"traceback": [
"\u001B[31m---------------------------------------------------------------------------\u001B[39m",
"\u001B[31mPermissionError\u001B[39m Traceback (most recent call last)",
"\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[2]\u001B[39m\u001B[32m, line 14\u001B[39m\n\u001B[32m 6\u001B[39m \u001B[38;5;28;01mfrom\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34;01mosekit\u001B[39;00m\u001B[34;01m.\u001B[39;00m\u001B[34;01mcore_api\u001B[39;00m\u001B[34;01m.\u001B[39;00m\u001B[34;01minstrument\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;28;01mimport\u001B[39;00m Instrument\n\u001B[32m 8\u001B[39m dataset = Dataset(\n\u001B[32m 9\u001B[39m folder=audio_folder,\n\u001B[32m 10\u001B[39m strptime_format=\u001B[33m\"\u001B[39m\u001B[33m%\u001B[39m\u001B[33my\u001B[39m\u001B[33m%\u001B[39m\u001B[33mm\u001B[39m\u001B[38;5;132;01m%d\u001B[39;00m\u001B[33m_\u001B[39m\u001B[33m%\u001B[39m\u001B[33mH\u001B[39m\u001B[33m%\u001B[39m\u001B[33mM\u001B[39m\u001B[33m%\u001B[39m\u001B[33mS\u001B[39m\u001B[33m\"\u001B[39m,\n\u001B[32m 11\u001B[39m instrument=Instrument(end_to_end_db=\u001B[32m150.0\u001B[39m),\n\u001B[32m 12\u001B[39m )\n\u001B[32m---> \u001B[39m\u001B[32m14\u001B[39m \u001B[43mdataset\u001B[49m\u001B[43m.\u001B[49m\u001B[43mbuild\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
"\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\public_api\\dataset.py:156\u001B[39m, in \u001B[36mDataset.build\u001B[39m\u001B[34m(self)\u001B[39m\n\u001B[32m 144\u001B[39m \u001B[38;5;28mself\u001B[39m.logger.info(\u001B[33m\"\u001B[39m\u001B[33mOrganizing dataset folder...\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 145\u001B[39m move_tree(\n\u001B[32m 146\u001B[39m source=\u001B[38;5;28mself\u001B[39m.folder,\n\u001B[32m 147\u001B[39m destination=\u001B[38;5;28mself\u001B[39m.folder / \u001B[33m\"\u001B[39m\u001B[33mother\u001B[39m\u001B[33m\"\u001B[39m,\n\u001B[32m (...)\u001B[39m\u001B[32m 154\u001B[39m | {\u001B[38;5;28mself\u001B[39m.folder / \u001B[33m\"\u001B[39m\u001B[33mlog\u001B[39m\u001B[33m\"\u001B[39m},\n\u001B[32m 155\u001B[39m )\n\u001B[32m--> \u001B[39m\u001B[32m156\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_sort_dataset\u001B[49m\u001B[43m(\u001B[49m\u001B[43mads\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 157\u001B[39m ads.write_json(ads.folder)\n\u001B[32m 158\u001B[39m \u001B[38;5;28mself\u001B[39m.write_json()\n",
"\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\public_api\\dataset.py:513\u001B[39m, in \u001B[36mDataset._sort_dataset\u001B[39m\u001B[34m(self, dataset)\u001B[39m\n\u001B[32m 511\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34m_sort_dataset\u001B[39m(\u001B[38;5;28mself\u001B[39m, dataset: \u001B[38;5;28mtype\u001B[39m[DatasetChild]) -> \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m 512\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mtype\u001B[39m(dataset) \u001B[38;5;129;01mis\u001B[39;00m AudioDataset:\n\u001B[32m--> \u001B[39m\u001B[32m513\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_sort_audio_dataset\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdataset\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 514\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m\n\u001B[32m 515\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mtype\u001B[39m(dataset) \u001B[38;5;129;01mis\u001B[39;00m SpectroDataset | LTASDataset:\n",
"\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\public_api\\dataset.py:520\u001B[39m, in \u001B[36mDataset._sort_audio_dataset\u001B[39m\u001B[34m(self, dataset)\u001B[39m\n\u001B[32m 519\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34m_sort_audio_dataset\u001B[39m(\u001B[38;5;28mself\u001B[39m, dataset: AudioDataset) -> \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[32m--> \u001B[39m\u001B[32m520\u001B[39m \u001B[43mdataset\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmove_files\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43m_get_audio_dataset_subpath\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdataset\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\n",
"\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\core_api\\base_dataset.py:152\u001B[39m, in \u001B[36mBaseDataset.move_files\u001B[39m\u001B[34m(self, folder)\u001B[39m\n\u001B[32m 143\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"Move the dataset files to the destination folder.\u001B[39;00m\n\u001B[32m 144\u001B[39m \n\u001B[32m 145\u001B[39m \u001B[33;03mParameters\u001B[39;00m\n\u001B[32m (...)\u001B[39m\u001B[32m 149\u001B[39m \n\u001B[32m 150\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 151\u001B[39m \u001B[38;5;28;01mfor\u001B[39;00m file \u001B[38;5;129;01min\u001B[39;00m tqdm(\u001B[38;5;28mself\u001B[39m.files, disable=os.environ.get(\u001B[33m\"\u001B[39m\u001B[33mDISABLE_TQDM\u001B[39m\u001B[33m\"\u001B[39m, \u001B[33m\"\u001B[39m\u001B[33m\"\u001B[39m)):\n\u001B[32m--> \u001B[39m\u001B[32m152\u001B[39m \u001B[43mfile\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmove\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfolder\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 153\u001B[39m \u001B[38;5;28mself\u001B[39m._folder = folder\n",
"\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\core_api\\audio_file.py:128\u001B[39m, in \u001B[36mAudioFile.move\u001B[39m\u001B[34m(self, folder)\u001B[39m\n\u001B[32m 119\u001B[39m \u001B[38;5;250m\u001B[39m\u001B[33;03m\"\"\"Move the file to the target folder.\u001B[39;00m\n\u001B[32m 120\u001B[39m \n\u001B[32m 121\u001B[39m \u001B[33;03mParameters\u001B[39;00m\n\u001B[32m (...)\u001B[39m\u001B[32m 125\u001B[39m \n\u001B[32m 126\u001B[39m \u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 127\u001B[39m afm.close()\n\u001B[32m--> \u001B[39m\u001B[32m128\u001B[39m \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[43m.\u001B[49m\u001B[43mmove\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfolder\u001B[49m\u001B[43m)\u001B[49m\n",
"\u001B[36mFile \u001B[39m\u001B[32m~\\Documents\\GitHub\\OSEkit\\src\\osekit\\core_api\\base_file.py:171\u001B[39m, in \u001B[36mBaseFile.move\u001B[39m\u001B[34m(self, folder)\u001B[39m\n\u001B[32m 169\u001B[39m destination_path = folder / \u001B[38;5;28mself\u001B[39m.path.name\n\u001B[32m 170\u001B[39m folder.mkdir(exist_ok=\u001B[38;5;28;01mTrue\u001B[39;00m, parents=\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[32m--> \u001B[39m\u001B[32m171\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[43m.\u001B[49m\u001B[43mpath\u001B[49m\u001B[43m.\u001B[49m\u001B[43mrename\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdestination_path\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 172\u001B[39m \u001B[38;5;28mself\u001B[39m.path = destination_path\n",
"\u001B[36mFile \u001B[39m\u001B[32m~\\AppData\\Roaming\\uv\\python\\cpython-3.13.3-windows-x86_64-none\\Lib\\pathlib\\_local.py:767\u001B[39m, in \u001B[36mPath.rename\u001B[39m\u001B[34m(self, target)\u001B[39m\n\u001B[32m 757\u001B[39m \u001B[38;5;28;01mdef\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34mrename\u001B[39m(\u001B[38;5;28mself\u001B[39m, target):\n\u001B[32m 758\u001B[39m \u001B[38;5;250m \u001B[39m\u001B[33;03m\"\"\"\u001B[39;00m\n\u001B[32m 759\u001B[39m \u001B[33;03m Rename this path to the target path.\u001B[39;00m\n\u001B[32m 760\u001B[39m \n\u001B[32m (...)\u001B[39m\u001B[32m 765\u001B[39m \u001B[33;03m Returns the new Path instance pointing to the target path.\u001B[39;00m\n\u001B[32m 766\u001B[39m \u001B[33;03m \"\"\"\u001B[39;00m\n\u001B[32m--> \u001B[39m\u001B[32m767\u001B[39m \u001B[43mos\u001B[49m\u001B[43m.\u001B[49m\u001B[43mrename\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtarget\u001B[49m\u001B[43m)\u001B[49m\n\u001B[32m 768\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m.with_segments(target)\n",
"\u001B[31mPermissionError\u001B[39m: [WinError 32] Le processus ne peut pas accéder au fichier car ce fichier est utilisé par un autre processus: '_static\\\\sample_audio\\\\sample_220925_223530.wav' -> '_static\\\\sample_audio\\\\data\\\\audio\\\\original\\\\sample_220925_223530.wav'"
]
}
],
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
Expand Down Expand Up @@ -225,7 +183,7 @@
"audio_dataset = dataset.get_analysis_audiodataset(analysis=analysis)\n",
"\n",
"# Filter the returned AudioDataset\n",
"audio_dataset.data = [ad for ad in audio_dataset.data if not ad.is_empty]"
"audio_dataset.remove_empty_data(threshold=0.0)"
]
},
{
Expand Down
4 changes: 2 additions & 2 deletions docs/source/example_reshaping_multiple_files_core.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@
" f\"{'Nb of Empty data:':<30}{str(len([ad for ad in audio_dataset.data if ad.is_empty])):>30}\\n\"\n",
")\n",
"\n",
"# Remove the empty data by using the default AudioDataset constructor:\n",
"audio_dataset = AudioDataset([ad for ad in audio_dataset.data if not ad.is_empty])"
"# Remove the empty data:\n",
"audio_dataset.remove_empty_data(threshold=0.0)"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion docs/source/example_reshaping_multiple_files_public.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@
"audio_dataset = dataset.get_analysis_audiodataset(analysis=analysis)\n",
"\n",
"# Filter the returned AudioDataset\n",
"audio_dataset.data = [ad for ad in audio_dataset.data if not ad.is_empty]"
"audio_dataset.remove_empty_data(threshold=0.0)"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion docs/source/publicapi_usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ The returned ``AudioDataset`` can be edited at will and passed as a parameter la
ads = dataset.get_analysis_audiodataset(analysis=analysis)

# Filtering out the AudioData that are not linked to any audio file:
ads.data = [ad for ad in ads.data if not ad.is_empty]
ads.remove_empty_data(threshold=0.)

The returned ``SpectroDataset`` can be used e.g. to plot sample spectrograms prior to the analysis:

Expand Down
17 changes: 16 additions & 1 deletion src/osekit/core_api/base_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from typing import Self, TypeVar

import numpy as np
from pandas import Timestamp, date_range
from pandas import Timedelta, Timestamp, date_range

from osekit.config import (
DPDEFAULT,
Expand Down Expand Up @@ -129,6 +129,21 @@ def end(self, value: Timestamp) -> None:
for item in self.items:
item.end = min(item.end, value)

@property
def populated_duration(self) -> Timedelta:
    """Cumulated duration of the items that are not flagged as empty.

    Returns
    -------
    Timedelta
        Sum of ``item.duration`` over every item of ``self.items`` whose
        ``is_empty`` attribute is falsy; ``Timedelta(0)`` if there is none.

    """
    cumulated = Timedelta(0)
    for item in self.items:
        if item.is_empty:
            # Empty items do not contribute to the populated duration.
            continue
        cumulated = cumulated + item.duration
    return Timedelta(cumulated)

@property
def populated_ratio(self) -> float:
    """Fraction of the data duration that is covered by non-empty items.

    The value is a ratio, not a percentage: it is ``populated_duration``
    divided by ``duration``.

    Returns
    -------
    float
        ``populated_duration / duration``, or ``0.0`` when ``duration`` is
        zero (nothing can be populated in a zero-length data, and the
        division would otherwise be undefined).

    """
    total_duration = self.duration
    if total_duration == Timedelta(0):
        # Guard against dividing by a zero-length duration.
        return 0.0
    return self.populated_duration / total_duration

def get_value(self) -> np.ndarray:
    """Return the values of every item, joined end to end in item order."""
    item_values = [item.get_value() for item in self.items]
    return np.concatenate(item_values)
Expand Down
16 changes: 16 additions & 0 deletions src/osekit/core_api/base_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,22 @@ def data_duration(self) -> Timedelta:
]
return max(set(data_durations), key=data_durations.count)

def remove_empty_data(self, threshold: float = 0.0) -> None:
    """Remove the data whose populated ratio does not exceed ``threshold``.

    A data is kept only if its ``populated_ratio`` is strictly greater than
    ``threshold``. With the default ``threshold=0.0``, only the data whose
    populated ratio is ``0`` are removed. Because the comparison is strict,
    a data whose ratio equals ``threshold`` is removed as well (including a
    ratio of ``1.0`` when ``threshold=1.0``).

    Parameters
    ----------
    threshold: float
        Populated ratio (a fraction, not a percentage) at or below which
        a data is removed.
        Must be in the ``[0.,1.]`` interval.

    Raises
    ------
    ValueError
        If ``threshold`` is outside the ``[0.,1.]`` interval.

    """
    if not 0.0 <= threshold <= 1.0:
        msg = f"Threshold should be between 0 and 1. Got {threshold}"
        raise ValueError(msg)
    # Strict comparison: with threshold=0.0 fully-empty data are dropped.
    self.data = [data for data in self.data if data.populated_ratio > threshold]

def write(
self,
folder: Path,
Expand Down
14 changes: 14 additions & 0 deletions src/osekit/core_api/spectro_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import numpy as np
import pandas as pd
from matplotlib.dates import date2num
from pandas import Timedelta
from scipy.signal import ShortTimeFFT, welch

from osekit.config import (
Expand Down Expand Up @@ -248,6 +249,19 @@ def v_lim(self, v_lim: tuple[float, float] | None) -> None:
v_lim = (-120.0, 0.0) if self.db_type == "FS" else (0.0, 170.0)
self._v_lim = v_lim

@property
def populated_duration(self) -> Timedelta:
    """Populated duration of the spectro data (overrides the parent property).

    When the data has files, the parent implementation is used. Otherwise
    the populated duration of the linked audio data is returned, or a zero
    ``Timedelta`` if there is no audio data to fall back on.
    """
    if self.files:
        return super().populated_duration
    audio = self.audio_data
    return audio.populated_duration if audio else Timedelta(0)

def get_value(self) -> np.ndarray:
"""Return the Sx matrix of the spectrogram.

Expand Down
Empty file added tests/helpers/__init__.py
Empty file.
106 changes: 106 additions & 0 deletions tests/helpers/dummy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import typing
from pathlib import Path
from typing import Self

import numpy as np
from pandas import Timestamp

from osekit.core_api.base_data import BaseData, TFile
from osekit.core_api.base_dataset import BaseDataset, TData
from osekit.core_api.base_file import BaseFile
from osekit.core_api.base_item import BaseItem


class DummyFile(BaseFile):
    """File test double whose ``read`` does nothing.

    The empty string is declared as its only supported extension.
    """

    supported_extensions: typing.ClassVar = [""]

    def read(self, start: Timestamp, stop: Timestamp) -> np.ndarray:
        """Do nothing: the body is empty, so ``None`` is implicitly returned."""


class DummyItem(BaseItem[DummyFile]):
    """Item test double bound to ``DummyFile``; adds nothing to ``BaseItem``."""


class DummyData(BaseData[DummyItem, DummyFile]):
    """Data test double built on ``DummyItem`` and ``DummyFile``.

    Each method is either a no-op or a plain delegation, so that the
    behaviour inherited from ``BaseData`` can be tested on its own.
    """

    item_cls = DummyItem

    def write(self, folder: Path, *, link: bool = False) -> None:
        """Do nothing: the dummy data is never written."""

    def link(self, folder: Path) -> None:
        """Do nothing: the dummy data is never linked to a file."""

    def _make_split_data(
        self,
        files: list[DummyFile],
        begin: Timestamp,
        end: Timestamp,
        **kwargs,  # noqa: ANN003
    ) -> Self:
        """Build a ``DummyData`` from ``files`` between ``begin`` and ``end``."""
        split = DummyData.from_files(files=files, begin=begin, end=end, **kwargs)
        return split

    @classmethod
    def _make_file(cls, path: Path, begin: Timestamp) -> DummyFile:
        """Build a ``DummyFile`` from its path and begin timestamp."""
        dummy_file = DummyFile(path=path, begin=begin)
        return dummy_file

    @classmethod
    def _make_item(
        cls,
        file: TFile | None = None,
        begin: Timestamp | None = None,
        end: Timestamp | None = None,
    ) -> DummyItem:
        """Build a ``DummyItem`` from the given file and boundaries."""
        dummy_item = DummyItem(file=file, begin=begin, end=end)
        return dummy_item

    @classmethod
    def _from_base_dict(
        cls,
        dictionary: dict,
        files: list[TFile],
        begin: Timestamp,
        end: Timestamp,
        **kwargs,  # noqa: ANN003
    ) -> Self:
        """Build the data from ``files``, ``begin`` and ``end`` only.

        ``dictionary`` and the extra keyword arguments are not used.
        """
        return cls.from_files(files=files, begin=begin, end=end)

    @classmethod
    def from_files(
        cls,
        files: list[DummyFile],
        begin: Timestamp | None = None,
        end: Timestamp | None = None,
        name: str | None = None,
        **kwargs,  # noqa: ANN003
    ) -> Self:
        """Forward every argument to the parent ``from_files``."""
        return super().from_files(
            files=files, begin=begin, end=end, name=name, **kwargs
        )


class DummyDataset(BaseDataset[DummyData, DummyFile]):
    """Dataset test double made of ``DummyData`` and ``DummyFile``."""

    file_cls = DummyFile

    @classmethod
    def _data_from_dict(cls, dictionary: dict) -> list[TData]:
        """Build one ``DummyData`` per value of ``dictionary``."""
        return list(map(DummyData.from_dict, dictionary.values()))

    @classmethod
    def _data_from_files(
        cls,
        files: list[DummyFile],
        begin: Timestamp | None = None,
        end: Timestamp | None = None,
        name: str | None = None,
        **kwargs,  # noqa: ANN003
    ) -> TData:
        """Build a ``DummyData`` from files; extra keyword arguments are ignored."""
        dummy_data = DummyData.from_files(
            files=files, begin=begin, end=end, name=name
        )
        return dummy_data
Loading