Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Changelogs for this project are recorded in this file since v0.2.0.
### Added

* Allow parallel computation of DTW barycenters and plug it in `TimeSeriesKMeans`.
* `tslearn.metrics.sbd` and `tslearn.metrics.cdist_sbd` expose the Shape-Based Distance used inside `KShape` as stand-alone functions ([#276](https://github.com/tslearn-team/tslearn/issues/276)).

### Changed

Expand Down
44 changes: 44 additions & 0 deletions tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1111,3 +1111,47 @@ def test_sax():
dataset2=[[-1, 0, 1], [1, 0, 1]],
)
np.testing.assert_equal(dists, expected)


def test_sbd_public():
    """Regression test for #276: SBD is exposed as stand-alone functions.

    ``sbd`` must be usable as a scalar distance, the way dtw / soft_dtw /
    lcss already are, and ``cdist_sbd`` must agree with
    ``1 - cdist_normalized_cc`` so callers can reproduce KShape's distance
    without monkey-patching.
    """
    from tslearn.metrics import sbd, cdist_sbd
    from tslearn.metrics.cycc import cdist_normalized_cc

    tol = 1e-12

    # 1) Identical series -> SBD = 0. The value is obtained as
    #    1 - max(cross-correlation) in floating point, so compare with the
    #    same tolerance used for the diagonal check below rather than with
    #    exact equality.
    np.testing.assert_allclose(
        sbd([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]), 0.0, atol=tol
    )

    # 2) Mismatched shapes are rejected with a clear error.
    with pytest.raises(ValueError):
        sbd([1.0, 2.0, 3.0], [[1.0, 2.0], [3.0, 4.0]])

    # 3) cdist_sbd returns a true distance matrix (zero on the diagonal,
    #    symmetric, non-negative). The legacy self_similarity=True branch of
    #    cdist_normalized_cc is *not* a valid SBD matrix on its own, which is
    #    what this PR works around.
    rng = np.random.RandomState(0)
    X = rng.rand(4, 6, 1).astype(np.float64)
    D = cdist_sbd(X)
    assert D.shape == (4, 4)
    np.testing.assert_allclose(np.diag(D), np.zeros(4), atol=tol)
    np.testing.assert_allclose(D, D.T, atol=tol)
    assert (D >= -tol).all()

    # 4) Scalar sbd matches the pairwise matrix entries.
    np.testing.assert_allclose(sbd(X[0], X[1]), D[0, 1], atol=tol)

    # 5) Two-dataset call has the right shape and matches the scalar version.
    Y = rng.rand(2, 6, 1).astype(np.float64)
    D2 = cdist_sbd(X, Y)
    assert D2.shape == (4, 2)
    np.testing.assert_allclose(sbd(X[2], Y[1]), D2[2, 1], atol=tol)

    # 6) Cross-check with the raw normalized cross-correlation: this is the
    #    contract callers rely on when porting KShape-style code. Norms of
    #    -1.0 ask the kernel to compute the norms itself.
    norms = np.full(4, -1.0, dtype=np.float64)
    cc = cdist_normalized_cc(X, X, norms.copy(), norms.copy(), False)
    np.testing.assert_allclose(D, 1.0 - cc, atol=tol)
3 changes: 3 additions & 0 deletions tslearn/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
)
from .soft_dtw_loss_pytorch import SoftDTWLossPyTorch
from .cycc import cdist_normalized_cc, y_shifted_sbd_vec
from ._sbd import sbd, cdist_sbd
from ._frechet import (
frechet,
frechet_path,
Expand Down Expand Up @@ -129,6 +130,8 @@
"SoftDTWLossPyTorch",
"cdist_normalized_cc",
"y_shifted_sbd_vec",
"sbd",
"cdist_sbd",
Comment on lines +133 to +134

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needs to be added to docs/gen_modules/tslearn.metrics.rst for proper API documentation

"frechet",
"frechet_path",
"frechet_accumulated_matrix",
Expand Down
130 changes: 130 additions & 0 deletions tslearn/metrics/_sbd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
"""Public Shape-Based Distance (SBD) helpers.

SBD is the distance used inside :class:`tslearn.clustering.KShape`. Until now
it was only available indirectly via :func:`cdist_normalized_cc`. Issue #276
asks for a function-level handle, the way :func:`tslearn.metrics.dtw` exposes
DTW as a stand-alone distance.
Comment on lines +3 to +6

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure whether the "current" state and the issue are worth mentioning in the module docstring, even though this won't be included in the API docs.


The implementation defers to the existing numba-jitted
:func:`tslearn.metrics.cycc.normalized_cc` so behaviour matches what KShape
already does internally — see ``KShape._cross_dists`` in
``tslearn/clustering/kshape.py``.
"""
import numpy

from .cycc import cdist_normalized_cc, normalized_cc
from ..utils import to_time_series, to_time_series_dataset


def sbd(s1, s2):
    r"""Shape-Based Distance (SBD) between two time series.

    SBD is defined in [1]_ as

    .. math::

        \mathrm{SBD}(\mathbf{x}, \mathbf{y}) =
            1 - \max_{w}\;\frac{\mathrm{CC}_w(\mathbf{x}, \mathbf{y})}
                               {\|\mathbf{x}\|_2 \cdot \|\mathbf{y}\|_2}

    where :math:`\mathrm{CC}_w` denotes the cross-correlation of the two
    series at lag :math:`w`. SBD is the distance used by
    :class:`tslearn.clustering.KShape`.

    Parameters
    ----------
    s1 : array-like, shape=(sz, d) or (sz,)
        A time series.
    s2 : array-like, shape=(sz, d) or (sz,)
        Another time series of the same length and dimensionality as ``s1``.

    Returns
    -------
    float
        SBD value in :math:`[0, 2]`. ``0`` means perfect shape match (up to a
        shift along the time axis), ``2`` means perfectly anti-correlated.

    Raises
    ------
    ValueError
        If the two series do not have the same shape once converted with
        :func:`tslearn.utils.to_time_series`.

    Examples
    --------
    >>> float(sbd([1., 2., 3.], [1., 2., 3.]))
    0.0
    >>> # Equal-length series with a partial shape match
    >>> float(round(sbd([1., 2., 3.], [3., 2., 1.]), 4))
    0.1429

    See Also
    --------
    cdist_sbd : Pairwise SBD on two datasets.
    cdist_normalized_cc : Underlying cross-correlation matrix.
    tslearn.clustering.KShape : Clustering algorithm built on SBD.

    References
    ----------
    .. [1] J. Paparrizos and L. Gravano. k-Shape: Efficient and Accurate
       Clustering of Time Series. SIGMOD 2015.
    """
    # to_time_series lets callers pass plain Python lists or 1-D arrays just
    # like dtw(); it already yields floating-point data, so no extra dtype
    # cast (and copy) is needed before calling the kernel.
    s1 = to_time_series(s1)
    s2 = to_time_series(s2)
    # SBD requires matching length / dimensionality.
    if s1.shape != s2.shape:
        raise ValueError(
            "sbd() requires both time series to have the same shape, "
            f"got {s1.shape} and {s2.shape}."
        )
    # normalized_cc returns the full lag-correlation vector; SBD is 1 minus
    # its max, matching KShape._cross_dists.
    cc = normalized_cc(s1, s2)
    return 1.0 - cc.max()


def cdist_sbd(dataset1, dataset2=None):
    """Pairwise Shape-Based Distance between two time-series datasets.

    Each entry is ``1 - cc`` where ``cc`` is the corresponding entry of the
    matrix returned by :func:`cdist_normalized_cc`.

    Parameters
    ----------
    dataset1 : array-like, shape=(n_ts1, sz, d) or (n_ts1, sz)
        First dataset of time series.
    dataset2 : array-like, shape=(n_ts2, sz, d) or (n_ts2, sz), optional
        Second dataset. If ``None`` (default), pairwise SBD is computed within
        ``dataset1``.

    Returns
    -------
    numpy.ndarray of shape (n_ts1, n_ts2)
        Pairwise SBD values. Same convention as :func:`sbd`: ``0`` means
        identical shape, ``2`` means perfectly anti-correlated.

    Examples
    --------
    >>> import numpy
    >>> X = numpy.array([[[1.], [2.], [3.]], [[1.], [2.], [3.]]])
    >>> dists = cdist_sbd(X)
    >>> dists.shape
    (2, 2)
    >>> float(dists[0, 1])
    0.0

    See Also
    --------
    sbd : Scalar SBD between two time series.
    cdist_normalized_cc : Underlying cross-correlation matrix.
    """
    # to_time_series_dataset already yields floating-point data, so no extra
    # dtype cast (which would copy the whole dataset) is needed.
    dataset1 = to_time_series_dataset(dataset1)
    if dataset2 is None:
        dataset2 = dataset1
    else:
        dataset2 = to_time_series_dataset(dataset2)
    # cdist_normalized_cc expects pre-allocated norm vectors; passing -1.0
    # tells the kernel to compute them on the fly. We always pass
    # self_similarity=False: the self_similarity=True branch of that kernel
    # is tailored for KShape (zero diagonal, lower-triangle fill) and would
    # silently produce SBD=1 on the diagonal, which is wrong for callers who
    # want a true distance matrix where diag(SBD) == 0.
    norms1 = numpy.full(dataset1.shape[0], -1.0, dtype=numpy.float64)
    norms2 = numpy.full(dataset2.shape[0], -1.0, dtype=numpy.float64)
    cc = cdist_normalized_cc(
        dataset1, dataset2, norms1, norms2, False
    )
    return 1.0 - cc