Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
services:
database:
profiles: ["python", "php", "all"]
image: "openml/test-database:20240105"
image: "openml/test-database:v0.1.20260204"
container_name: "openml-test-database"
environment:
MYSQL_ROOT_PASSWORD: ok
Expand Down
5 changes: 4 additions & 1 deletion src/database/users.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
from sqlalchemy import Connection, text

# Enforces str is 32 hexadecimal characters, does not check validity.
Comment thread
SubhadityaMukherjee marked this conversation as resolved.
Comment thread
PGijsbers marked this conversation as resolved.
APIKey = Annotated[str, StringConstraints(pattern=r"^[0-9a-fA-F]{32}$")]
APIKey = Annotated[
str,
StringConstraints(pattern=r"^([0-9a-fA-F]{32})|(abc)|(normaluser)|(normaluser2)$"),
Comment thread
sourcery-ai[bot] marked this conversation as resolved.
Outdated
Comment thread
sourcery-ai[bot] marked this conversation as resolved.
Outdated
]
Comment thread
coderabbitai[bot] marked this conversation as resolved.


class UserGroup(IntEnum):
Expand Down
4 changes: 2 additions & 2 deletions tests/constants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
PRIVATE_DATASET_ID = {130}
IN_PREPARATION_ID = {33}
IN_PREPARATION_ID = {33, 161, 162, 163}
DEACTIVATED_DATASETS = {131}
DATASETS = set(range(1, 132))
DATASETS = set(range(1, 132)) | {161, 162, 163}

NUMBER_OF_DATASETS = len(DATASETS)
NUMBER_OF_DEACTIVATED_DATASETS = len(DEACTIVATED_DATASETS)
Expand Down
21 changes: 12 additions & 9 deletions tests/routers/openml/datasets_list_datasets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def test_list_filter_active(status: str, amount: int, py_api: TestClient) -> Non
("api_key", "amount"),
[
(ApiKey.ADMIN, constants.NUMBER_OF_DATASETS),
(ApiKey.OWNER_USER, constants.NUMBER_OF_DATASETS),
(ApiKey.DATASET_130_OWNER, constants.NUMBER_OF_DATASETS),
(ApiKey.SOME_USER, constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS),
(None, constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS),
],
Expand Down Expand Up @@ -91,13 +91,15 @@ def test_list_data_name_absent(name: str, py_api: TestClient) -> None:


@pytest.mark.parametrize("limit", [None, 5, 10, 200])
@pytest.mark.parametrize("offset", [None, 0, 5, 129, 130, 200])
@pytest.mark.parametrize("offset", [None, 0, 5, 129, 140, 200])
def test_list_pagination(limit: int | None, offset: int | None, py_api: TestClient) -> None:
# dataset ids are contiguous until 131, then there are 161, 162, and 163.
extra_datasets = [161, 162, 163]
all_ids = [
did
for did in range(1, 1 + constants.NUMBER_OF_DATASETS)
for did in range(1, 1 + constants.NUMBER_OF_DATASETS - len(extra_datasets))
if did not in constants.PRIVATE_DATASET_ID
]
] + extra_datasets

start = 0 if offset is None else offset
end = start + (100 if limit is None else limit)
Expand All @@ -108,7 +110,7 @@ def test_list_pagination(limit: int | None, offset: int | None, py_api: TestClie
filters = {"status": "all", "pagination": offset_body | limit_body}
response = py_api.post("/datasets/list", json=filters)

if offset in [130, 200]:
if offset in [140, 200]:
_assert_empty_result(response)
return

Expand All @@ -119,7 +121,7 @@ def test_list_pagination(limit: int | None, offset: int | None, py_api: TestClie

@pytest.mark.parametrize(
("version", "count"),
[(1, 100), (2, 6), (5, 1)],
[(1, 100), (2, 7), (5, 1)],
)
def test_list_data_version(version: int, count: int, py_api: TestClient) -> None:
response = py_api.post(
Expand All @@ -133,16 +135,17 @@ def test_list_data_version(version: int, count: int, py_api: TestClient) -> None


def test_list_data_version_no_result(py_api: TestClient) -> None:
version_with_no_datasets = 42
response = py_api.post(
f"/datasets/list?api_key={ApiKey.ADMIN}",
json={"status": "all", "data_version": 4},
json={"status": "all", "data_version": version_with_no_datasets},
)
_assert_empty_result(response)


@pytest.mark.parametrize(
"key",
[ApiKey.SOME_USER, ApiKey.OWNER_USER, ApiKey.ADMIN],
[ApiKey.SOME_USER, ApiKey.DATASET_130_OWNER, ApiKey.ADMIN],
)
@pytest.mark.parametrize(
("user_id", "count"),
Expand Down Expand Up @@ -211,7 +214,7 @@ def test_list_data_tag_empty(py_api: TestClient) -> None:
("number_classes", "2", 51),
("number_classes", "2..3", 56),
("number_missing_values", "2", 1),
("number_missing_values", "2..100000", 22),
("number_missing_values", "2..100000", 23),
],
)
def test_list_data_quality(quality: str, range_: str, count: int, py_api: TestClient) -> None:
Expand Down
6 changes: 3 additions & 3 deletions tests/routers/openml/datasets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from routers.openml.datasets import get_dataset
from schemas.datasets.openml import DatasetMetadata, DatasetStatus
from tests import constants
from tests.users import ADMIN_USER, NO_USER, OWNER_USER, SOME_USER, ApiKey
from tests.users import ADMIN_USER, DATASET_130_OWNER, NO_USER, SOME_USER, ApiKey


@pytest.mark.parametrize(
Expand Down Expand Up @@ -92,7 +92,7 @@ def test_private_dataset_no_access(


@pytest.mark.parametrize(
"user", [OWNER_USER, ADMIN_USER, pytest.param(SOME_USER, marks=pytest.mark.xfail)]
"user", [DATASET_130_OWNER, ADMIN_USER, pytest.param(SOME_USER, marks=pytest.mark.xfail)]
)
def test_private_dataset_access(user: User, expdb_test: Connection, user_test: Connection) -> None:
dataset = get_dataset(
Expand Down Expand Up @@ -165,7 +165,7 @@ def test_dataset_features_no_access(py_api: TestClient) -> None:

@pytest.mark.parametrize(
"api_key",
[ApiKey.ADMIN, ApiKey.OWNER_USER],
[ApiKey.ADMIN, ApiKey.DATASET_130_OWNER],
)
def test_dataset_features_access_to_private(api_key: ApiKey, py_api: TestClient) -> None:
response = py_api.get(f"/datasets/features/130?api_key={api_key}")
Expand Down
7 changes: 5 additions & 2 deletions tests/routers/openml/migration/datasets_migration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def test_private_dataset_no_user_no_access(

@pytest.mark.parametrize(
"api_key",
[ApiKey.OWNER_USER, ApiKey.ADMIN],
[ApiKey.DATASET_130_OWNER, ApiKey.ADMIN],
)
def test_private_dataset_owner_access(
py_api: TestClient,
Expand Down Expand Up @@ -225,4 +225,7 @@ def test_datasets_feature_is_identical(
else:
# The old API formats bool as string in lower-case
feature[key] = str(value) if not isinstance(value, bool) else str(value).lower()
assert python_body == original.json()["data_features"]["feature"]
Comment thread
PGijsbers marked this conversation as resolved.
original_features = original.json()["data_features"]["feature"]
for feature in original_features:
feature.pop("ontology", None)
assert python_body == original_features
26 changes: 14 additions & 12 deletions tests/routers/openml/study_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from starlette.testclient import TestClient

from schemas.study import StudyType
from tests.users import ApiKey


def test_get_task_study_by_id(py_api: TestClient) -> None:
Expand Down Expand Up @@ -458,7 +459,7 @@ def test_get_task_study_by_alias(py_api: TestClient) -> None:

def test_create_task_study(py_api: TestClient) -> None:
response = py_api.post(
"/studies?api_key=00000000000000000000000000000000",
f"/studies?api_key={ApiKey.SOME_USER}",
json={
"name": "Test Study",
"alias": "test-study",
Expand Down Expand Up @@ -518,27 +519,28 @@ def _attach_tasks_to_study(


def test_attach_task_to_study(py_api: TestClient, expdb_test: Connection) -> None:
expdb_test.execute(text("UPDATE study SET status = 'in_preparation' WHERE id = 7"))
response = _attach_tasks_to_study(
study_id=1,
task_ids=[2, 3, 4],
api_key="AD000000000000000000000000000000",
study_id=7,
task_ids=[50],
api_key=ApiKey.OWNER_USER,
Comment thread
PGijsbers marked this conversation as resolved.
py_api=py_api,
expdb_test=expdb_test,
)
assert response.status_code == HTTPStatus.OK
assert response.json() == {"study_id": 1, "main_entity_type": StudyType.TASK}
assert response.status_code == HTTPStatus.OK, response.content
assert response.json() == {"study_id": 7, "main_entity_type": StudyType.TASK}


def test_attach_task_to_study_needs_owner(py_api: TestClient, expdb_test: Connection) -> None:
expdb_test.execute(text("UPDATE study SET status = 'in_preparation' WHERE id = 1"))
expdb_test.execute(text("UPDATE study SET status = 'in_preparation' WHERE id = 7"))
response = _attach_tasks_to_study(
study_id=1,
task_ids=[2, 3, 4],
api_key="00000000000000000000000000000000",
api_key=ApiKey.OWNER_USER,
py_api=py_api,
expdb_test=expdb_test,
)
assert response.status_code == HTTPStatus.FORBIDDEN
assert response.status_code == HTTPStatus.FORBIDDEN, response.content


def test_attach_task_to_study_already_linked_raises(
Expand All @@ -549,11 +551,11 @@ def test_attach_task_to_study_already_linked_raises(
response = _attach_tasks_to_study(
study_id=1,
task_ids=[1, 3, 4],
api_key="AD000000000000000000000000000000",
api_key=ApiKey.ADMIN,
py_api=py_api,
expdb_test=expdb_test,
)
assert response.status_code == HTTPStatus.CONFLICT
assert response.status_code == HTTPStatus.CONFLICT, response.content
assert response.json() == {"detail": "Task 1 is already attached to study 1."}


Expand All @@ -565,7 +567,7 @@ def test_attach_task_to_study_but_task_not_exist_raises(
response = _attach_tasks_to_study(
study_id=1,
task_ids=[80123, 78914],
api_key="AD000000000000000000000000000000",
api_key=ApiKey.ADMIN,
py_api=py_api,
expdb_test=expdb_test,
)
Expand Down
2 changes: 1 addition & 1 deletion tests/routers/openml/users_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def test_fetch_user(api_key: str, user: User, user_test: Connection) -> None:
db_user = fetch_user(api_key, user_data=user_test)
assert db_user is not None
assert user.user_id == db_user.user_id
assert user.groups == db_user.groups
assert set(user.groups) == set(db_user.groups)


def test_fetch_user_invalid_key_returns_none(user_test: Connection) -> None:
Expand Down
12 changes: 7 additions & 5 deletions tests/users.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@

NO_USER = None
SOME_USER = User(user_id=2, _database=None, _groups=[UserGroup.READ_WRITE])
OWNER_USER = User(user_id=16, _database=None, _groups=[UserGroup.READ_WRITE])
ADMIN_USER = User(user_id=1, _database=None, _groups=[UserGroup.ADMIN, UserGroup.READ_WRITE])
OWNER_USER = User(user_id=3229, _database=None, _groups=[UserGroup.READ_WRITE])
DATASET_130_OWNER = User(user_id=16, _database=None, _groups=[UserGroup.READ_WRITE])
ADMIN_USER = User(user_id=1159, _database=None, _groups=[UserGroup.ADMIN, UserGroup.READ_WRITE])


class ApiKey(StrEnum):
ADMIN = "AD000000000000000000000000000000"
SOME_USER = "00000000000000000000000000000000"
OWNER_USER = "DA1A0000000000000000000000000000"
ADMIN = "abc"
SOME_USER = "normaluser2"
OWNER_USER = "normaluser"
DATASET_130_OWNER = "DA1A0000000000000000000000000000"
Comment on lines 12 to +16
Copy link
Copy Markdown
Contributor

@coderabbitai coderabbitai Bot Feb 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion | 🟠 Major

Non-hex API keys drive the production validation change — consider updating the test DB instead.

The literal keys "abc", "normaluser", and "normaluser2" don't conform to the hex-format expected by the production APIKey validator, which is why src/database/users.py had to be weakened. A cleaner approach would be to update the session_hash values for these users in the test database image to use 32-character hex strings, keeping production validation strict.

🤖 Prompt for AI Agents
In `@tests/users.py` around lines 12 - 16, The test ApiKey literals (ApiKey.ADMIN,
ApiKey.SOME_USER, ApiKey.OWNER_USER) are non-hex and forced a relaxation of the
production APIKey validator; instead, update the test DB image so the users'
session_hash values are 32-character hex strings and restore the strict
validator in src/database/users.py; specifically, change the test DB records for
users corresponding to "abc"/"normaluser"/"normaluser2" to use valid 32-char hex
session_hash values, revert any weakened validation logic, and keep the
ApiKey.StrEnum values matching those hex session_hashes (or leave the enum alone
if it references test keys) so production validation remains strict.

Copy link
Copy Markdown
Contributor Author

@PGijsbers PGijsbers Feb 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, which is why I had constructed the previous test database keys that way. But the new keys were chosen by @josvandervelde and are now used in both openml-python and openml java. We need to coordinate changing the keys again, so I am not updating the image for this PR.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like the humans are having a chat. I'll hop back into my burrow for now. If you need me again, just tag @coderabbitai in a new comment, and I'll come hopping out!

INVALID = "11111111111111111111111111111111"