diff --git a/docs/changes/newsfragments/8266.new b/docs/changes/newsfragments/8266.new new file mode 100644 index 000000000000..e86d8b829934 --- /dev/null +++ b/docs/changes/newsfragments/8266.new @@ -0,0 +1,12 @@ +Added :func:`qcodes.dataset.get_db_overview`, a fast way to list the runs in a +QCoDeS database. It fetches run metadata (experiment/sample names, timestamps, +record counts and guids) via a single ``JOIN`` query on the ``runs`` and +``experiments`` tables, without instantiating a ``DataSet`` object per run, +making it possible to list databases with many thousands of runs almost +instantly. The returned :class:`qcodes.dataset.RunOverviewDict` is also exported +on the public ``qcodes.dataset`` namespace. The function does not return +snapshots as they can be large and slow down building the database overview. +The number of results in each dataset is taken from shape information when +available, and otherwise falls back to a best-effort count of the rows in the +so-called results table that exists for every dataset, which may NOT be as +precise as ``DataSet.number_of_results``. 
diff --git a/src/qcodes/dataset/__init__.py b/src/qcodes/dataset/__init__.py index 88e5b43d7f24..0494069ab0b5 100644 --- a/src/qcodes/dataset/__init__.py +++ b/src/qcodes/dataset/__init__.py @@ -56,6 +56,7 @@ initialise_or_create_database_at, initialised_database_at, ) +from .sqlite.db_overview import RunOverviewDict, get_db_overview from .sqlite.settings import SQLiteSettings from .threading import ( SequentialParamsCaller, @@ -79,6 +80,7 @@ "ParamSpec", "ParamSpecTree", "RunDescriber", + "RunOverviewDict", "SQLiteSettings", "SequentialParamsCaller", "ThreadPoolParamsCaller", @@ -95,6 +97,7 @@ "export_datasets_and_create_metadata_db", "extract_runs_into_db", "get_data_export_path", + "get_db_overview", "get_default_experiment_id", "get_guids_by_run_spec", "guids_from_dbs", diff --git a/src/qcodes/dataset/sqlite/db_overview.py b/src/qcodes/dataset/sqlite/db_overview.py new file mode 100644 index 000000000000..08150f018224 --- /dev/null +++ b/src/qcodes/dataset/sqlite/db_overview.py @@ -0,0 +1,260 @@ +""" +This module provides a fast, lightweight overview of the runs stored in a +QCoDeS database. + +:func:`get_db_overview` issues a single ``JOIN`` query against the ``runs`` and +``experiments`` tables to collect run metadata (experiment/sample names, time +stamps, record counts, guids, ...) without instantiating a ``DataSet`` object +per run. This avoids the expensive ``experiments()`` + ``data_sets()`` +enumeration and makes it possible to list the contents of databases with many +thousands of runs almost instantly. It is primarily intended for tools that +need to display a table of runs (e.g. dataset browsers). 
+""" + +from __future__ import annotations + +import datetime +import json +import logging +import sqlite3 +from contextlib import closing, nullcontext +from typing import TYPE_CHECKING + +from typing_extensions import TypedDict + +from qcodes.dataset.sqlite.database import conn_from_dbpath_or_conn +from qcodes.dataset.sqlite.query_helpers import is_column_in_table + +if TYPE_CHECKING: + from collections.abc import Sequence + from pathlib import Path + + from qcodes.dataset.sqlite.connection import AtomicConnection + +log = logging.getLogger(__name__) + + +class RunOverviewDict(TypedDict): + """ + Lightweight overview of a single run. + + Contains only cheap-to-query metadata: no snapshot, no data and no full + ``DataSet`` object. Extra ad-hoc metadata columns requested via the + ``extra_columns`` argument of :func:`get_db_overview` are added to the + dictionary under their column name in addition to the keys documented here. + """ + + #: ``run_id`` of the run. + run_id: int + #: Name of the experiment the run belongs to. + experiment: str + #: Sample name of the experiment the run belongs to. + sample: str + #: Name of the run. + name: str + #: Local date the run was started, formatted as ``YYYY-MM-DD`` (empty + #: string if unknown). + started_date: str + #: Local time the run was started, formatted as ``HH:MM:SS`` (empty string + #: if unknown). + started_time: str + #: Local date the run was completed, formatted as ``YYYY-MM-DD`` (empty + #: string if the run has not completed). + completed_date: str + #: Local time the run was completed, formatted as ``HH:MM:SS`` (empty + #: string if the run has not completed). + completed_time: str + #: Best-effort number of data points in the run, see :func:`get_db_overview`. + records: int + #: guid of the run. + guid: str + + +def _format_timestamp(ts: float | None) -> tuple[str, str]: + """ + Convert a unix timestamp into ``(date, time)`` strings in local time. 
+
+    Returns a pair of empty strings if the timestamp is missing or invalid.
+    """
+    if ts is None or ts == 0:
+        return "", ""
+    try:
+        dt = datetime.datetime.fromtimestamp(ts)
+    except (OSError, ValueError, OverflowError):
+        return "", ""
+    return dt.strftime("%Y-%m-%d"), dt.strftime("%H:%M:%S")
+
+
+def _records_from_run_description(run_description_json: str | None) -> int:
+    """
+    Extract a data-point count from the ``shapes`` field of a run description.
+
+    The count is the sum, over every dependent parameter in the description's
+    ``shapes`` mapping, of the product of its shape dimensions. Returns ``0``
+    if the description is missing, unparsable or has no shape information;
+    shape entries that are not sequences of integers are skipped.
+    """
+    if not run_description_json:
+        return 0
+    try:
+        desc = json.loads(run_description_json)
+    except (json.JSONDecodeError, TypeError):
+        return 0
+    shapes = desc.get("shapes") if isinstance(desc, dict) else None
+    if not isinstance(shapes, dict):
+        return 0
+    total = 0
+    for shape in shapes.values():
+        usable = isinstance(shape, (list, tuple)) and len(shape) > 0
+        if usable and all(isinstance(dim, int) for dim in shape):
+            n = 1
+            for dim in shape:
+                n *= dim
+            total += n
+    return total
+
+
+def get_db_overview(
+    path_to_db: str | Path | None = None,
+    *,
+    conn: AtomicConnection | None = None,
+    start_run_id: int = 0,
+    extra_columns: Sequence[str] | None = None,
+) -> dict[int, RunOverviewDict]:
+    """
+    Get a lightweight overview of the runs in a QCoDeS database.
+
+    This uses a single SQL ``JOIN`` query on the ``runs`` and ``experiments``
+    tables to fetch run metadata, avoiding the much more expensive
+    ``experiments()`` + ``data_sets()`` enumeration that instantiates a
+    ``DataSet`` object per run. It is therefore well suited for listing the
+    contents of databases with many runs. The (potentially large) snapshot of
+    each run is deliberately not read, as it would slow down building the
+    overview significantly.
+ + The reported number of ``records`` is a best-effort estimate of the number + of data points in a run and may be less precise than + ``DataSet.number_of_results``: + + * For completed runs the shape information stored in the run description is + preferred (it is the authoritative final count), falling back to the + number of rows in the results table. + * For runs that are still in progress the number of rows in the results + table is preferred (it grows as data is added), falling back to the run + description shapes. + * If neither is available the count is reported as ``0`` (unknown). + + Only one of ``path_to_db`` and ``conn`` should be supplied. If a + ``path_to_db`` is given the database is opened in read-only mode and the + connection is closed again before returning. + + Args: + path_to_db: Path to the database file. Opened read-only if given. + conn: An existing connection to use instead of ``path_to_db``. It is + left open by this function. + start_run_id: Only return runs whose ``run_id`` is strictly greater + than this value. Use ``0`` (the default) to get all runs, or pass + the last known ``run_id`` to fetch only newly added runs. + extra_columns: Names of additional ``runs``-table columns to include in + each :class:`RunOverviewDict`. Columns that do not exist in the + ``runs`` table of the given database are silently skipped. This is + useful for reading ad-hoc metadata columns added via + ``DataSet.add_metadata``. + + Returns: + A dictionary mapping ``run_id`` to a :class:`RunOverviewDict`. 
+
+    """
+    overview: dict[int, RunOverviewDict] = {}
+
+    created_conn = conn is None
+    connection = conn_from_dbpath_or_conn(
+        conn=conn, path_to_db=path_to_db, read_only=True
+    )
+    manager = closing(connection) if created_conn else nullcontext(connection)
+
+    with manager as c:
+        valid_extra_columns = [
+            col for col in (extra_columns or []) if is_column_in_table(c, "runs", col)
+        ]
+        # Quoted, as metadata column names need not be bare SQL identifiers.
+        extra_select = "".join(f', r."{col}"' for col in valid_extra_columns)
+
+        # ``run_description`` feeds the record count of completed runs; the
+        # (potentially large) snapshot is deliberately excluded.
+        query = f"""
+            SELECT r.run_id, e.name, e.sample_name, r.name,
+                   r.run_timestamp, r.completed_timestamp,
+                   r.guid, r.result_table_name,
+                   r.run_description{extra_select}
+            FROM runs r
+            JOIN experiments e ON r.exp_id = e.exp_id
+            WHERE r.run_id > ?
+            ORDER BY r.run_id
+        """
+
+        try:
+            rows = c.execute(query, (start_run_id,)).fetchall()
+        except sqlite3.Error as e:
+            log.warning("Could not query database overview: %s", e)
+            return overview
+
+        # ``result_counter`` in the runs table is the run's ordinal within its
+        # experiment, not a data-point count, so it is not usable here. The row
+        # count of each results table is queried separately instead; note that
+        # for the ``array`` paramtype one row can hold many data points, so
+        # the row count can underestimate the number of points in that case.
+ result_tables = {row[7] for row in rows if row[7]} + row_counts: dict[str, int] = {} + for table in result_tables: + try: + (count,) = c.execute(f'SELECT COUNT(*) FROM "{table}"').fetchone() + except sqlite3.Error: + continue # results table may not exist (yet) + row_counts[table] = count + + n_fixed = 9 # number of columns selected before ``extra_columns`` + for row in rows: + run_id = row[0] + started_date, started_time = _format_timestamp(row[4]) + completed_date, completed_time = _format_timestamp(row[5]) + result_table = row[7] or "" + is_completed = row[5] is not None and row[5] != 0 + + # The record count is a best-effort data-point count. For completed + # runs the run-description shapes are the authoritative final count; + # for in-progress runs the live results-table row count is preferred + # as it grows while data is added. ``0`` means "unknown". + if is_completed: + records = _records_from_run_description(row[8]) + if records == 0: + records = row_counts.get(result_table, 0) + else: + records = row_counts.get(result_table, 0) + if records == 0: + records = _records_from_run_description(row[8]) + + entry: RunOverviewDict = { + "run_id": run_id, + "experiment": row[1] or "", + "sample": row[2] or "", + "name": row[3] or "", + "started_date": started_date, + "started_time": started_time, + "completed_date": completed_date, + "completed_time": completed_time, + "records": records, + "guid": row[6] or "", + } + if valid_extra_columns: + extra = { + col: row[n_fixed + i] for i, col in enumerate(valid_extra_columns) + } + # The keys of ``extra`` are only known at runtime (they are the + # user-supplied ``extra_columns``), so they cannot be part of + # the closed ``RunOverviewDict`` definition. 
+ entry.update(extra) # type: ignore[typeddict-item] + + overview[run_id] = entry + + return overview diff --git a/tests/dataset/test_db_overview.py b/tests/dataset/test_db_overview.py new file mode 100644 index 000000000000..5e4fdaf63bf2 --- /dev/null +++ b/tests/dataset/test_db_overview.py @@ -0,0 +1,229 @@ +"""Tests for :mod:`qcodes.dataset.sqlite.db_overview`.""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING + +import pytest + +from qcodes.dataset import ( + get_db_overview, + load_or_create_experiment, + new_data_set, +) +from qcodes.dataset.descriptions.dependencies import InterDependencies_ +from qcodes.dataset.sqlite.connection import path_to_dbfile +from qcodes.dataset.sqlite.database import connect +from qcodes.dataset.sqlite.db_overview import ( + _format_timestamp, + _records_from_run_description, +) +from qcodes.parameters import ParamSpecBase + +if TYPE_CHECKING: + from pathlib import Path + + from qcodes.dataset.sqlite.connection import AtomicConnection + + +@pytest.fixture(name="db_conn") +def _db_conn(tmp_path: Path, request: pytest.FixtureRequest) -> AtomicConnection: + """Connection to a fresh temporary database. + + An explicit connection object is used rather than + ``initialise_or_create_database_at`` so that the global QCoDeS config is + left untouched. The connection is closed through a finalizer so that it is + cleaned up even if the test fails. 
+ """ + conn = connect(str(tmp_path / "overview.db")) + request.addfinalizer(conn.close) + return conn + + +def _create_runs( + conn: AtomicConnection, + *, + n_runs: int = 1, + n_points: int = 10, + experiment_name: str = "test_exp", + sample_name: str = "test_sample", +) -> None: + """Create ``n_runs`` simple numeric runs on ``conn``.""" + exp = load_or_create_experiment(experiment_name, sample_name=sample_name, conn=conn) + p_x = ParamSpecBase("x", "numeric") + p_y = ParamSpecBase("y", "numeric") + interdeps = InterDependencies_(dependencies={p_y: (p_x,)}) + + for r in range(n_runs): + ds = new_data_set(f"run_{r + 1}", exp_id=exp.exp_id, conn=conn) + ds.set_interdependencies(interdeps) + ds.mark_started() + for i in range(n_points): + ds.add_results([{p_x.name: float(i), p_y.name: float(i**2)}]) + ds.mark_completed() + + +def test_records_from_run_description() -> None: + desc = json.dumps({"version": 3, "shapes": {"dep1": [100, 50]}}) + assert _records_from_run_description(desc) == 5000 + + multi = json.dumps({"shapes": {"dep1": [10], "dep2": [5, 4]}}) + assert _records_from_run_description(multi) == 30 + + assert _records_from_run_description(json.dumps({"version": 3})) == 0 + assert _records_from_run_description(json.dumps({"shapes": {}})) == 0 + assert _records_from_run_description(None) == 0 + assert _records_from_run_description("") == 0 + assert _records_from_run_description("not valid json") == 0 + + +def test_format_timestamp() -> None: + assert _format_timestamp(None) == ("", "") + assert _format_timestamp(0) == ("", "") + # a value that cannot be converted to a datetime is handled gracefully + assert _format_timestamp(float("nan")) == ("", "") + + date, time = _format_timestamp(1_600_000_000.0) + assert len(date) == len("YYYY-MM-DD") + assert date.count("-") == 2 + assert len(time) == len("HH:MM:SS") + assert time.count(":") == 2 + + +def test_get_db_overview_basic_fields(db_conn: AtomicConnection) -> None: + _create_runs(db_conn, n_runs=2) + + 
overview = get_db_overview(path_to_dbfile(db_conn)) + + assert set(overview.keys()) == {1, 2} + for run_id, info in overview.items(): + assert info["run_id"] == run_id + assert info["experiment"] == "test_exp" + assert info["sample"] == "test_sample" + assert info["name"] == f"run_{run_id}" + assert info["guid"] + assert info["started_date"] and info["started_time"] + assert info["completed_date"] and info["completed_time"] + + +def test_get_db_overview_counts_result_rows(db_conn: AtomicConnection) -> None: + _create_runs(db_conn, n_runs=3, n_points=10) + + overview = get_db_overview(conn=db_conn) + + for run_id, info in overview.items(): + (table_name,) = db_conn.execute( + "SELECT result_table_name FROM runs WHERE run_id=?", (run_id,) + ).fetchone() + (actual,) = db_conn.execute(f'SELECT COUNT(*) FROM "{table_name}"').fetchone() + assert info["records"] == actual == 10 + + +def test_get_db_overview_incremental(db_conn: AtomicConnection) -> None: + _create_runs(db_conn, n_runs=2) + path = path_to_dbfile(db_conn) + + assert set(get_db_overview(path).keys()) == {1, 2} + assert get_db_overview(path, start_run_id=2) == {} + + _create_runs(db_conn, n_runs=1, experiment_name="test_exp2", sample_name="s2") + + incremental = get_db_overview(path, start_run_id=2) + assert set(incremental.keys()) == {3} + assert incremental[3]["experiment"] == "test_exp2" + + +def test_get_db_overview_extra_columns(db_conn: AtomicConnection) -> None: + exp = load_or_create_experiment("exp", sample_name="sample", conn=db_conn) + p_x = ParamSpecBase("x", "numeric") + p_y = ParamSpecBase("y", "numeric") + interdeps = InterDependencies_(dependencies={p_y: (p_x,)}) + ds = new_data_set("tagged_run", exp_id=exp.exp_id, conn=db_conn) + ds.set_interdependencies(interdeps) + ds.mark_started() + ds.add_results([{p_x.name: 1.0, p_y.name: 2.0}]) + ds.mark_completed() + ds.add_metadata("my_tag", "hello") + + # An existing ad-hoc metadata column is returned ... 
+ overview = get_db_overview(conn=db_conn, extra_columns=["my_tag"]) + assert overview[1]["my_tag"] == "hello" # type: ignore[typeddict-item] + + # ... while a non-existent column is silently skipped. + overview = get_db_overview(conn=db_conn, extra_columns=["does_not_exist"]) + assert "does_not_exist" not in overview[1] + + +def test_get_db_overview_accepts_connection(db_conn: AtomicConnection) -> None: + _create_runs(db_conn, n_runs=1) + + overview = get_db_overview(conn=db_conn) + assert set(overview.keys()) == {1} + # the externally supplied connection must remain usable + assert db_conn.execute("SELECT COUNT(*) FROM runs").fetchone()[0] == 1 + + +def test_get_db_overview_rejects_conn_and_path(db_conn: AtomicConnection) -> None: + _create_runs(db_conn, n_runs=1) + + with pytest.raises(ValueError): + get_db_overview(path_to_db=path_to_dbfile(db_conn), conn=db_conn) + + +def test_get_db_overview_in_progress_uses_live_row_count( + db_conn: AtomicConnection, +) -> None: + exp = load_or_create_experiment("exp", sample_name="sample", conn=db_conn) + p_x = ParamSpecBase("x", "numeric") + p_y = ParamSpecBase("y", "numeric") + interdeps = InterDependencies_(dependencies={p_y: (p_x,)}) + + # run 1: started, still in progress, with 5 data points + ds_live = new_data_set("live", exp_id=exp.exp_id, conn=db_conn) + ds_live.set_interdependencies(interdeps) + ds_live.mark_started() + for i in range(5): + ds_live.add_results([{p_x.name: float(i), p_y.name: float(i)}]) + # run 2: started, in progress, no data yet + ds_empty = new_data_set("empty", exp_id=exp.exp_id, conn=db_conn) + ds_empty.set_interdependencies(interdeps) + ds_empty.mark_started() + + overview = get_db_overview(conn=db_conn) + + # in-progress runs report the live results-table row count and have no + # completed timestamp + assert overview[1]["records"] == 5 + assert overview[1]["completed_date"] == "" + assert overview[1]["completed_time"] == "" + assert overview[2]["records"] == 0 + + +def 
test_get_db_overview_missing_results_table_reports_zero( + db_conn: AtomicConnection, +) -> None: + _create_runs(db_conn, n_runs=1, n_points=5) + + (table_name,) = db_conn.execute( + "SELECT result_table_name FROM runs WHERE run_id=1" + ).fetchone() + db_conn.execute(f'DROP TABLE "{table_name}"') + db_conn.commit() + + # with the results table gone (and no shape info) the count is unknown; + # ``result_counter`` is the run ordinal, not a data-point count, so it must + # not be used as a fallback + overview = get_db_overview(conn=db_conn) + assert overview[1]["records"] == 0 + + +def test_get_db_overview_query_error_returns_empty( + db_conn: AtomicConnection, +) -> None: + _create_runs(db_conn, n_runs=1) + + # remove a table the overview query depends on so the JOIN fails + db_conn.execute("DROP TABLE experiments") + db_conn.commit() + assert get_db_overview(conn=db_conn) == {}