diff --git a/docs/concepts/upath.md b/docs/concepts/upath.md
index db1a4b5d..c75b980b 100644
--- a/docs/concepts/upath.md
+++ b/docs/concepts/upath.md
@@ -239,9 +239,70 @@ process_file(
 )
 ```
 
+## Path Equality and Filesystem Identity
+
+Unlike `pathlib.Path`, which compares paths by their string representation alone, `UPath` also considers **filesystem identity** when comparing paths. Two UPaths are equal if they refer to the same file on the same filesystem.
+
+### How Equality Works
+
+```python
+from upath import UPath
+
+# Same path, same filesystem -> equal (even with different options)
+UPath('s3://bucket/file.txt') == UPath('s3://bucket/file.txt', anon=True)  # True
+
+# Same path, different filesystem -> not equal
+UPath('s3://bucket/file.txt') != UPath('s3://bucket/file.txt',
+                                       endpoint_url='http://localhost:9000')  # True
+```
+
+### Filesystem Identity (fsid)
+
+UPath uses **fsid** (filesystem identifier) to determine whether two paths are on the same filesystem. If a cached filesystem exists and implements fsid, that value is used. Otherwise, fsid is computed from the protocol, storage_options, and the fsspec global config (`fsspec.config.conf`), **without instantiating the filesystem**. This allows paths to be compared without requiring credentials or network access.
+
+Unlike fsspec filesystems, which raise `NotImplementedError` when fsid is not implemented, `UPath.fsid` returns `None` if the filesystem identity cannot be determined (e.g., for unknown protocols or wrapper filesystems). When fsid is `None`, path comparison falls back to comparing `storage_options` directly:
+
+| Filesystem | Identity Based On |
+|------------|-------------------|
+| Local (`file://`, plain paths) | Always `"local"` |
+| HTTP/HTTPS | Always `"http"` |
+| S3 | `endpoint_url` (AWS endpoints normalized) |
+| GCS | Always `"gcs"` (single global endpoint) |
+| Azure Blob | `account_name` |
+| SFTP/SSH | `host` + `port` |
+| SMB | `host` + `port` |
+
+Options like authentication (`anon`, `key`, `token`), performance settings (`block_size`), and behavior flags (`auto_mkdir`) don't affect filesystem identity.
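+
+You can inspect the identifier directly via the `fsid` property. A small sketch (the S3 lines assume the `s3fs` backend is installed; the value for a custom endpoint carries a hash suffix derived from the endpoint URL):
+
+```python
+from upath import UPath
+
+UPath('/tmp/file.txt').fsid               # 'local'
+UPath('s3://bucket/key').fsid             # 's3_aws'
+UPath('s3://bucket/key', anon=True).fsid  # 's3_aws' (auth options ignored)
+UPath('s3://bucket/key',
+      endpoint_url='http://localhost:9000').fsid  # 's3_...' (endpoint-specific)
+```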
+
+### Impact on Path Operations
+
+Filesystem identity affects `relative_to()`, `is_relative_to()`, and parent comparisons:
+
+```python
+from upath import UPath
+
+base = UPath('s3://bucket/data')
+child = UPath('s3://bucket/data/file.txt', anon=True)
+
+# Works: same filesystem despite different storage_options
+child.relative_to(base)     # PurePosixPath('file.txt')
+child.is_relative_to(base)  # True
+base in child.parents       # True
+```
+
+### Comparison with pathlib.Path
+
+| Aspect | `pathlib.Path` | `UPath` |
+|--------|----------------|---------|
+| Equality based on | Path string only | Protocol + path + filesystem identity |
+| `storage_options` | N/A | Ignored if fsid can be determined |
+| Same path, different credentials | N/A | Equal (same filesystem) |
+| Same path, different endpoints | N/A | Not equal (different filesystems) |
+
 ## Learn More
 
 - **pathlib concepts**: See [pathlib.md](pathlib.md) for details on the pathlib API
 - **fsspec backends**: See [filesystems.md](fsspec.md) for information about available filesystems
 - **API reference**: Check the [API documentation](../api/index.md) for complete method details
 - **fsspec details**: Visit [fsspec documentation](https://filesystem-spec.readthedocs.io/) for filesystem-specific options
+- **Migration guide**: See [migration.md](../migration.md) for version-specific changes
diff --git a/docs/migration.md b/docs/migration.md
index 1e7569d9..5c07d969 100644
--- a/docs/migration.md
+++ b/docs/migration.md
@@ -9,6 +9,104 @@ This guide helps you migrate to newer versions of universal-pathlib.
 and this guide is missing information.
+
+## Migrating to v0.4.0
+
+Version `0.4.0` changes how `UPath` determines path equality. Previously, paths with different `storage_options` were always considered unequal. Now, equality is based on **filesystem identity** (fsid), which ignores options that don't affect which filesystem is being accessed.
+
+### Background: The Problem with storage_options Equality
+
+In versions prior to `0.4.0`, `UPath.__eq__` compared `storage_options` directly:
+
+```python
+# Pre-0.4.0 behavior (unintuitive)
+from upath import UPath
+
+# Same S3 file, but different auth options -> NOT equal
+UPath('s3://bucket/file.txt') == UPath('s3://bucket/file.txt', anon=True)  # False
+
+# Same local file, but different behavior options -> NOT equal
+UPath('/tmp/file.txt') == UPath('/tmp/file.txt', auto_mkdir=True)  # False
+```
+
+This caused subtle bugs when comparing paths that referred to the same filesystem resource. Methods like `relative_to()` and `is_relative_to()` would fail unexpectedly:
+
+```python
+# Pre-0.4.0: This raised ValueError despite referring to the same S3 bucket
+p1 = UPath('s3://bucket/dir/file.txt', anon=True)
+p2 = UPath('s3://bucket/dir')
+p1.relative_to(p2)  # ValueError: incompatible storage_options
+```
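+
+In practice, code often worked around the old behavior by comparing string forms or by stripping `storage_options` before comparing. A sketch of the kind of helper that is typically no longer needed (the helper name is hypothetical):
+
+```python
+def same_resource(a, b):
+    # pre-0.4.0 workaround: compare protocol and path, ignoring storage_options
+    return a.protocol == b.protocol and str(a) == str(b)
+```
+
+With the fsid-based equality described in the next section, a plain `a == b` covers this case while still distinguishing genuinely different filesystems (e.g., different endpoints).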
+
+### New Behavior: Filesystem Identity (fsid)
+
+Starting with `0.4.0`, equality is based on filesystem identity. Two UPaths are equal if they have the same protocol, path, and filesystem identity, regardless of authentication or performance options:
+
+```python
+# v0.4.0+ behavior
+from upath import UPath
+
+# Same filesystem, different options -> equal
+UPath('s3://bucket/file.txt') == UPath('s3://bucket/file.txt', anon=True)  # True
+UPath('/tmp/file.txt') == UPath('/tmp/file.txt', auto_mkdir=True)  # True
+
+# Different filesystems -> not equal
+UPath('s3://bucket/file.txt') != UPath('s3://bucket/file.txt',
+                                       endpoint_url='http://localhost:9000')  # True (MinIO vs AWS)
+```
+
+**Options ignored for equality** (don't affect filesystem identity):
+
+- Authentication: `anon`, `key`, `secret`, `token`, `profile`
+- Performance: `default_block_size`, `default_cache_type`, `max_concurrency`
+- Behavior: `auto_mkdir`, `default_acl`, `requester_pays`
+
+**Options that affect equality** (change which filesystem is accessed):
+
+- S3: Different `endpoint_url` (e.g., AWS vs MinIO vs LocalStack)
+- Azure: Different `account_name`
+- SFTP/SMB/FTP: Different `host` or `port`
+
+### Impact on Path Operations
+
+The `relative_to()` and `is_relative_to()` methods now use filesystem identity:
+
+```python
+from upath import UPath
+
+p1 = UPath('s3://bucket/dir/file.txt', anon=True)
+p2 = UPath('s3://bucket/dir')  # Different storage_options, same filesystem
+
+# v0.4.0+: Works because both paths are on the same S3 filesystem
+p1.is_relative_to(p2)  # True
+p1.relative_to(p2)     # PurePosixPath('file.txt')
+
+# Different endpoints are correctly rejected
+p3 = UPath('s3://bucket/dir', endpoint_url='http://localhost:9000')
+p1.is_relative_to(p3)  # False (different filesystem)
+p1.relative_to(p3)     # ValueError: incompatible filesystems
+```
+
+### Migration Checklist
+
+If your code relied on the previous behavior, where different `storage_options` meant different paths:
+
+1. **Review equality checks**: Code that expected `UPath(url, opt1=x) != UPath(url, opt1=y)` to hold may now find the two paths compare equal when they are on the same filesystem.
+
+2. **Check set/dict usage**: Paths that were previously distinct dict keys or set members may now be treated as the same key or member. Note that `__hash__` already ignored `storage_options`, so this is unlikely to be a new issue.
+
+3. **Update tests**: Tests that asserted inequality based on `storage_options` differences may need updating.
+
+### Fallback Behavior
+
+For filesystems where UPath cannot determine identity (e.g., the memory filesystem or unknown protocols), it falls back to comparing `storage_options` directly, preserving pre-0.4.0 behavior:
+
+```python
+from upath import UPath
+
+# Memory filesystem: no fsid, falls back to storage_options comparison
+UPath('memory:///file.txt', opt=1) != UPath('memory:///file.txt', opt=2)  # True
+```
+
 ## Migrating to v0.3.0
 
 Version `0.3.0` introduced a breaking change to fix a longstanding bug related to `os.PathLike` protocol compliance. This change affects how UPath instances work with standard library functions that expect local filesystem paths.
diff --git a/upath/_fsid.py b/upath/_fsid.py
new file mode 100644
index 00000000..bef83d74
--- /dev/null
+++ b/upath/_fsid.py
@@ -0,0 +1,129 @@
+"""Filesystem identity (fsid) fallback computation.
+
+This module provides `_fallback_fsid` to compute filesystem identity from
+protocol, storage_options, and fsspec global config (`fsspec.config.conf`)
+without instantiating the filesystem.
+
+The fsid is used by __eq__, relative_to, and is_relative_to to determine
+if two paths are on the same filesystem.
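+
+With the default fsspec config, the identifiers that are not hash-based are
+stable; a brief illustration:
+
+>>> _fallback_fsid("file", {})
+'local'
+>>> _fallback_fsid("s3", {"anon": True})
+'s3_aws'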
The key insight is that many +storage_options (like authentication or performance settings) don't affect +*which* filesystem is being accessed, only *how* it's accessed. + +For filesystems where fsid cannot be determined (e.g., memory filesystem, +unknown protocols), returns None and callers fall back to comparing +storage_options directly. +""" + +from __future__ import annotations + +from collections import ChainMap +from collections.abc import Mapping +from typing import Any + +from fsspec.config import conf as fsspec_conf +from fsspec.utils import tokenize + +__all__ = ["_fallback_fsid"] + + +def _fallback_fsid(protocol: str, storage_options: Mapping[str, Any]) -> str | None: + """Compute fsid from protocol, storage_options, and fsspec global config.""" + global_opts = fsspec_conf.get(protocol) + opts: Mapping[str, Any] = ( + ChainMap(storage_options, global_opts) # type: ignore[arg-type] + if global_opts + else storage_options + ) + + match protocol: + # Static fsid (no instance attributes needed) + case "" | "file" | "local": + return "local" + case "http" | "https": + return "http" + case "memory" | "memfs": + return None # Non-durable, fall back to storage_options + case "data": + return None # Non-durable + + # Host + port based + case "sftp" | "ssh": + host = opts.get("host", "") + port = opts.get("port", 22) + return f"sftp_{tokenize(host, port)}" if host else None + case "smb": + host = opts.get("host", "") + port = opts.get("port", 445) + return f"smb_{tokenize(host, port)}" if host else None + case "ftp": + host = opts.get("host", "") + port = opts.get("port", 21) + return f"ftp_{tokenize(host, port)}" if host else None + case "webhdfs" | "webHDFS": + host = opts.get("host", "") + port = opts.get("port", 50070) + return f"webhdfs_{tokenize(host, port)}" if host else None + + # Cloud object storage + case "s3" | "s3a": + endpoint = opts.get("endpoint_url", "https://s3.amazonaws.com") + # Normalize AWS endpoints + from urllib.parse import urlparse + + parsed = urlparse(endpoint) + if parsed.netloc.endswith(".amazonaws.com"): + return "s3_aws" + return f"s3_{tokenize(endpoint)}" + case "gcs" | "gs": + return "gcs" # Single global endpoint + case "abfs" | "az": + account = opts.get("account_name", "") + return f"abfs_{tokenize(account)}" if account else None + case "adl": + tenant = opts.get("tenant_id", "") + store = opts.get("store_name", "") + return f"adl_{tokenize(tenant, store)}" if tenant and store else None + case "oci": + region = opts.get("region", "") + return f"oci_{tokenize(region)}" if region else None + case "oss": + endpoint = opts.get("endpoint", "") + return f"oss_{tokenize(endpoint)}" if endpoint else None + + # Git-based + case "git": + path = opts.get("path", "") + ref = opts.get("ref", "") + return f"git_{tokenize(path, ref)}" if path else None + case "github": + org = opts.get("org", "") + repo = opts.get("repo", "") + sha = opts.get("sha", "") + return f"github_{tokenize(org, repo, sha)}" if org and repo else None + + # Platform-specific + case "hf": + endpoint = opts.get("endpoint", "huggingface.co") + return f"hf_{tokenize(endpoint)}" + case "lakefs": + host = opts.get("host", "") + return f"lakefs_{tokenize(host)}" if host else None + case "webdav": + base_url = opts.get("base_url", "") + return f"webdav_{tokenize(base_url)}" if base_url else None + case "box": + return "box" + case "dropbox": + return "dropbox" + + # Wrappers - delegate to underlying + case "simplecache" | "filecache" | "blockcache" | "cached": + return None # Complex, fall back + + # 
Archive filesystems - need underlying fs info + case "zip" | "tar": + return None # Complex, fall back + + # Default: unknown protocol, fall back to storage_options + case _: + return None diff --git a/upath/core.py b/upath/core.py index d572a4c9..62cb14f0 100644 --- a/upath/core.py +++ b/upath/core.py @@ -30,6 +30,7 @@ from upath._chain import DEFAULT_CHAIN_PARSER from upath._chain import Chain from upath._chain import FSSpecChainParser +from upath._fsid import _fallback_fsid from upath._flavour import LazyFlavourDescriptor from upath._flavour import WrappedFileSystemFlavour from upath._flavour import upath_get_kwargs_from_url @@ -302,6 +303,47 @@ def fs(self) -> AbstractFileSystem: ) return fs + @property + def fsid(self) -> str | None: + """The filesystem identifier for this path. + + Returns a string that identifies the filesystem this path is on, + or None if the filesystem identity cannot be determined. Used by + __eq__, relative_to, and is_relative_to to compare paths. + + The fsid ignores storage_options that don't affect which filesystem + is accessed (e.g., auth, performance settings). Options that do + affect identity (e.g., endpoint_url for S3) are included. + + Note + ---- + This property does not instantiate the filesystem. If a cached + filesystem exists and implements fsid, that value is used. Otherwise, + fsid is computed from protocol, storage_options, and fsspec global + config (``fsspec.config.conf``) without filesystem access. + + Unlike fsspec filesystems which raise NotImplementedError when fsid + is not implemented, UPath.fsid returns None if the filesystem identity + cannot be determined (e.g., for unknown protocols or wrapper filesystems). + When fsid is None, path comparison falls back to comparing storage_options + directly. + + Examples + -------- + >>> from upath import UPath + >>> UPath('/tmp/file.txt').fsid + 'local' + >>> UPath('s3://bucket/key').fsid + 's3_aws' + >>> UPath('s3://bucket/key', endpoint_url='http://localhost:9000').fsid + 's3_...' # Different hash for non-AWS endpoint + + """ + try: + return self._fs_cached.fsid + except (AttributeError, NotImplementedError): + return _fallback_fsid(self.protocol, self.storage_options) + @property def path(self) -> str: """The path used by fsspec filesystem. @@ -1696,8 +1738,17 @@ def is_absolute(self) -> bool: return self.parser.isabs(self.__vfspath__()) def __eq__(self, other: object) -> bool: - """UPaths are considered equal if their protocol, path and - storage_options are equal.""" + """Check equality based on protocol, path, and filesystem identity. + + Two UPaths are equal if they refer to the same file on the same + filesystem. Filesystem identity is determined by fsid, which ignores + options that don't affect which filesystem is accessed (e.g., auth, + performance settings). Options that do affect identity (e.g., + endpoint_url for S3) will make paths unequal. + + For filesystems where fsid cannot be determined, falls back to + comparing storage_options directly. 
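+
+        Examples
+        --------
+        The outcomes below assume the s3fs backend is installed; ``anon``
+        does not change the filesystem identity, while a custom
+        ``endpoint_url`` does:
+
+        >>> from upath import UPath
+        >>> UPath('s3://bucket/key') == UPath('s3://bucket/key', anon=True)
+        True
+        >>> UPath('s3://bucket/key') == UPath(
+        ...     's3://bucket/key', endpoint_url='http://localhost:9000'
+        ... )
+        False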
+ """ if not isinstance(other, UPath): return NotImplemented @@ -1716,11 +1767,14 @@ def __eq__(self, other: object) -> bool: # One is relative, one is not - they can't be equal return False - return ( - self.__vfspath__() == other.__vfspath__() - and self.protocol == other.protocol - and self.storage_options == other.storage_options - ) + if self.__vfspath__() != other.__vfspath__() or self.protocol != other.protocol: + return False + + fsid1, fsid2 = self.fsid, other.fsid + if fsid1 is not None and fsid2 is not None: + return fsid1 == fsid2 + + return self.storage_options == other.storage_options def __hash__(self) -> int: """The returned hash is based on the protocol and path only. @@ -2063,7 +2117,11 @@ def relative_to( # type: ignore[override] "incompatible protocols:" f" {self.protocol!r} != {other.protocol!r}" ) - if self.storage_options != other.storage_options: + fsid1, fsid2 = self.fsid, other.fsid + if fsid1 is not None and fsid2 is not None: + if fsid1 != fsid2: + raise ValueError(f"incompatible filesystems: {fsid1!r} != {fsid2!r}") + elif self.storage_options != other.storage_options: raise ValueError( "incompatible storage_options:" f" {self.storage_options!r} != {other.storage_options!r}" @@ -2087,8 +2145,13 @@ def is_relative_to( *_deprecated: Any, ) -> bool: # type: ignore[override] """Return True if the path is relative to another path identified.""" - if isinstance(other, UPath) and self.storage_options != other.storage_options: - return False + if isinstance(other, UPath): + fsid1, fsid2 = self.fsid, other.fsid + if fsid1 is not None and fsid2 is not None: + if fsid1 != fsid2: + return False + elif self.storage_options != other.storage_options: + return False elif isinstance(other, str): other = self.with_segments(other) return self == other or other in self.parents diff --git a/upath/extensions.py b/upath/extensions.py index a60e7ca8..fcdc45b5 100644 --- a/upath/extensions.py +++ b/upath/extensions.py @@ -455,6 +455,10 @@ def storage_options(self) -> Mapping[str, Any]: def fs(self) -> AbstractFileSystem: return self.__wrapped__.fs + @property + def fsid(self) -> str | None: + return self.__wrapped__.fsid + @property def path(self) -> str: return self.__wrapped__.path diff --git a/upath/implementations/local.py b/upath/implementations/local.py index 3bf3ec76..819590f7 100644 --- a/upath/implementations/local.py +++ b/upath/implementations/local.py @@ -217,11 +217,12 @@ def __eq__(self, other: object) -> bool: eq_path = super().__eq__(other) if eq_path is NotImplemented: return NotImplemented - return ( - eq_path - and self.protocol == other.protocol - and self.storage_options == other.storage_options - ) + if not eq_path or self.protocol != other.protocol: + return False + fsid1, fsid2 = self.fsid, other.fsid + if fsid1 is not None and fsid2 is not None: + return fsid1 == fsid2 + return self.storage_options == other.storage_options def __ne__(self, other: object) -> bool: if not isinstance(other, UPath): @@ -229,11 +230,12 @@ def __ne__(self, other: object) -> bool: ne_path = super().__ne__(other) if ne_path is NotImplemented: return NotImplemented - return ( - ne_path - or self.protocol != other.protocol - or self.storage_options != other.storage_options - ) + if ne_path or self.protocol != other.protocol: + return True + fsid1, fsid2 = self.fsid, other.fsid + if fsid1 is not None and fsid2 is not None: + return fsid1 != fsid2 + return self.storage_options != other.storage_options def __hash__(self) -> int: return super().__hash__() diff --git a/upath/tests/cases.py 
b/upath/tests/cases.py index 22587fa9..fd2f5aaf 100644 --- a/upath/tests/cases.py +++ b/upath/tests/cases.py @@ -185,8 +185,13 @@ def test_eq(self): p1 = cls(str(self.path), test_extra=1, **self.path.storage_options) p2 = cls(str(self.path), test_extra=2, **self.path.storage_options) assert p0 == p1 - assert p0 != p2 - assert p1 != p2 + # When fsid is defined, paths with same path and fsid are equal + # regardless of storage_options. When fsid is None, paths fall back + # to storage_options comparison. + if p0.fsid is not None: + assert p0 == p2 # Same fsid, so equal despite different storage_options + else: + assert p0 != p2 # No fsid, falls back to storage_options comparison def test_relative_to(self): base = self.path diff --git a/upath/tests/test_core.py b/upath/tests/test_core.py index 9d6d9317..ff521a96 100644 --- a/upath/tests/test_core.py +++ b/upath/tests/test_core.py @@ -342,10 +342,13 @@ def test_relative_to(): with pytest.raises(ValueError): UPath("s3://test_bucket/file.txt").relative_to(UPath("gcs://test_bucket")) - with pytest.raises(ValueError): + # S3 paths with different auth options but same endpoint should work + # (they have the same fsid "s3_aws") + assert "file.txt" == str( UPath("s3://test_bucket/file.txt", anon=True).relative_to( UPath("s3://test_bucket", anon=False) ) + ) def test_uri_parsing(): diff --git a/upath/tests/test_fsid.py b/upath/tests/test_fsid.py new file mode 100644 index 00000000..cb7a1007 --- /dev/null +++ b/upath/tests/test_fsid.py @@ -0,0 +1,191 @@ +"""Tests for fsid-based path equivalence.""" + +import pytest + +from upath import UPath + + +# --- __eq__ tests --- + + +def test_eq_with_fsid_local(tmp_path): + """Local paths with different storage_options should be equal.""" + p1 = UPath(tmp_path / "test.txt") + p2 = UPath(tmp_path / "test.txt", auto_mkdir=True) + assert p1 == p2 + + +def test_eq_with_fsid_http(): + """HTTP paths with different storage_options should be equal.""" + p1 = UPath("http://example.com/file.txt") + p2 = UPath("http://example.com/file.txt", block_size=1024) + assert p1 == p2 + + +def test_eq_http_https_different_protocol(): + """HTTP and HTTPS are different protocols, so paths are not equal.""" + p1 = UPath("http://example.com/file.txt") + p2 = UPath("https://example.com/file.txt") + assert p1 != p2 + + +def test_eq_different_filesystem(): + """Paths on different filesystems should not be equal.""" + p1 = UPath("/tmp/file.txt") + p2 = UPath("memory:///tmp/file.txt") + assert p1 != p2 + + +def test_eq_s3_same_endpoint(): + """S3 paths with same endpoint but different auth should be equal.""" + p1 = UPath("s3://bucket/key") + p2 = UPath("s3://bucket/key", anon=True) + assert p1 == p2 + + +def test_eq_s3_different_endpoint(): + """S3 paths with different endpoints should not be equal.""" + p1 = UPath("s3://bucket/key") + p2 = UPath("s3://bucket/key", endpoint_url="http://localhost:9000") + assert p1 != p2 + + +# --- relative_to tests --- + + +def test_relative_to_with_fsid(tmp_path): + """relative_to should work when fsids match.""" + p1 = UPath(tmp_path / "dir" / "file.txt") + p2 = UPath(tmp_path / "dir", auto_mkdir=True) + rel = p1.relative_to(p2) + assert str(rel) == "file.txt" + + +def test_relative_to_different_fsid(): + """relative_to should raise when fsids differ.""" + p1 = UPath("s3://bucket/dir/file.txt") + p2 = UPath("s3://bucket/dir", endpoint_url="http://localhost:9000") + with pytest.raises(ValueError, match="incompatible filesystems"): + p1.relative_to(p2) + + +# --- is_relative_to tests --- + + +def 
test_is_relative_to_with_fsid(tmp_path): + """is_relative_to should return True when fsids match.""" + p1 = UPath(tmp_path / "dir" / "file.txt") + p2 = UPath(tmp_path / "dir", auto_mkdir=True) + assert p1.is_relative_to(p2) + + +def test_is_relative_to_different_fsid(): + """is_relative_to should return False when fsids differ.""" + p1 = UPath("s3://bucket/dir/file.txt") + p2 = UPath("s3://bucket/dir", endpoint_url="http://localhost:9000") + assert not p1.is_relative_to(p2) + + +# --- _fallback_fsid audit tests --- +# These tests verify that our fallback fsid computation matches +# the native fsid implementations in fsspec filesystems. + + +def test_fallback_matches_local_filesystem(): + """Verify _fallback_fsid matches LocalFileSystem.fsid.""" + from upath._fsid import _fallback_fsid + + p = UPath("/tmp/test.txt") + native_fsid = p.fs.fsid + fallback_fsid = _fallback_fsid(p.protocol, p.storage_options) + assert native_fsid == fallback_fsid == "local" + + +def test_fallback_matches_http_filesystem(): + """Verify _fallback_fsid matches HTTPFileSystem.fsid.""" + from upath._fsid import _fallback_fsid + + for url in ["http://example.com/file.txt", "https://example.com/file.txt"]: + p = UPath(url) + native_fsid = p.fs.fsid + fallback_fsid = _fallback_fsid(p.protocol, p.storage_options) + assert native_fsid == fallback_fsid == "http" + + +def test_fsid_consistency_cached_vs_uncached(tmp_path): + """Verify fsid is consistent whether filesystem is cached or not.""" + # Create two paths - check fsid before and after fs access + p1 = UPath(tmp_path / "test.txt") + p2 = UPath(tmp_path / "test.txt", auto_mkdir=True) + + # Before any fs access (uses fallback) + fsid1_before = p1.fsid + fsid2_before = p2.fsid + + # Access fs on p1 only (p1 now uses cached fs.fsid) + _ = p1.fs + fsid1_after = p1.fsid + fsid2_still_fallback = p2.fsid + + # All should be equal + assert fsid1_before == fsid2_before == fsid1_after == fsid2_still_fallback == "local" + + +def test_fallback_uses_global_config(): + """Verify _fallback_fsid incorporates fsspec global config.""" + from fsspec.config import conf as fsspec_conf + + from upath._fsid import _fallback_fsid + + # Before setting config - default AWS + assert _fallback_fsid("s3", {}) == "s3_aws" + + # Set global config + fsspec_conf["s3"] = {"endpoint_url": "http://minio.local:9000"} + try: + # Should now use the global config endpoint + fsid_with_config = _fallback_fsid("s3", {}) + assert fsid_with_config != "s3_aws" + assert fsid_with_config.startswith("s3_") + + # Explicit storage_options should override global config + assert _fallback_fsid("s3", {"endpoint_url": "https://s3.amazonaws.com"}) == "s3_aws" + finally: + # Clean up + del fsspec_conf["s3"] + + +def test_fallback_ignores_auth_options(): + """Verify auth options don't affect fsid.""" + from upath._fsid import _fallback_fsid + + base = _fallback_fsid("s3", {}) + with_anon = _fallback_fsid("s3", {"anon": True}) + with_key = _fallback_fsid("s3", {"key": "xxx", "secret": "yyy"}) + + assert base == with_anon == with_key == "s3_aws" + + +def test_fallback_respects_identity_options(): + """Verify identity-relevant options produce different fsids.""" + from upath._fsid import _fallback_fsid + + # S3: different endpoints = different fsid + aws = _fallback_fsid("s3", {}) + minio = _fallback_fsid("s3", {"endpoint_url": "http://localhost:9000"}) + assert aws != minio + + # SFTP: different hosts = different fsid + host1 = _fallback_fsid("sftp", {"host": "server1.com"}) + host2 = _fallback_fsid("sftp", {"host": 
"server2.com"}) + assert host1 != host2 + + # SFTP: different ports = different fsid + port22 = _fallback_fsid("sftp", {"host": "server.com", "port": 22}) + port2222 = _fallback_fsid("sftp", {"host": "server.com", "port": 2222}) + assert port22 != port2222 + + # Azure: different accounts = different fsid + acc1 = _fallback_fsid("abfs", {"account_name": "storage1"}) + acc2 = _fallback_fsid("abfs", {"account_name": "storage2"}) + assert acc1 != acc2 diff --git a/upath/tests/test_relative.py b/upath/tests/test_relative.py index 8d71bb45..e8623be4 100644 --- a/upath/tests/test_relative.py +++ b/upath/tests/test_relative.py @@ -98,11 +98,19 @@ def test_relative_path_validation(): with pytest.raises(ValueError, match="incompatible protocols"): p.relative_to(UPath("s3://bucket")) - # Different storage options should fail - with pytest.raises(ValueError, match="incompatible storage_options"): + # S3 paths with different auth options but same endpoint should work + # (they have the same fsid "s3_aws") + assert "file" == str( UPath("s3://bucket/file", anon=True).relative_to( UPath("s3://bucket", anon=False) ) + ) + + # Different endpoints should fail (different fsid) + with pytest.raises(ValueError, match="incompatible filesystems"): + UPath("s3://bucket/file").relative_to( + UPath("s3://bucket", endpoint_url="http://localhost:9000") + ) def test_path_not_in_subpath():