Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/multi-omics-submission-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:

# Paths for MARS repo and ISA template
REPOSITORY_SERVICES_PATH: ${{ github.workspace }}/MARS/repository-services
ISA_TEMPLATE_PATH: ${{ github.workspace }}/MARS/test-data/biosamples-input-isa.json
ISA_TEMPLATE_PATH: ${{ github.workspace }}/MARS/test-data/biosamples-input-isa-multi.json

# Credentials from GitHub secrets
WEBIN_USERNAME: ${{ secrets.WEBIN_USERNAME }}
Expand Down
2 changes: 1 addition & 1 deletion MARS
Submodule MARS updated 18 files
+7 −31 ...isajson-ena/src/main/java/com/elixir/biohackaton/ISAToSRA/controller/WebinIsaToXmlSubmissionController.java
+149 −4 ...tory-services/isajson-ena/src/main/java/com/elixir/biohackaton/ISAToSRA/sra/service/MarsReceiptService.java
+414 −162 ...rvices/isajson-ena/src/main/java/com/elixir/biohackaton/ISAToSRA/sra/service/WebinExperimentXmlCreator.java
+0 −50 ...-services/isajson-ena/src/main/java/com/elixir/biohackaton/ISAToSRA/sra/service/WebinProjectXmlCreator.java
+77 −30 ...tory-services/isajson-ena/src/main/java/com/elixir/biohackaton/ISAToSRA/sra/service/WebinRunXmlCreator.java
+169 −55 ...ry-services/isajson-ena/src/main/java/com/elixir/biohackaton/ISAToSRA/sra/service/WebinStudyXmlCreator.java
+28 −0 ...itory-services/isajson-ena/src/test/java/com/elixir/biohackaton/ISAToSRA/WebinExperimentXmlCreatorTest.java
+40 −12 repository-services/isajson-ena/src/test/java/com/elixir/biohackaton/ISAToSRA/WebinRunXmlCreatorTest.java
+60 −4 repository-services/receipt/src/main/java/com/elixir/biohackaton/ISAToSRA/receipt/MarsReceiptProvider.java
+1,617 −0 test-data/biosamples-input-isa-multi.json
+126 −78 test-data/biosamples-input-isa.json
+81 −17 test-data/biosamples-modified-isa.json
+2 −2 test-data/biosamples-original-isa-no-accesion-char.json
+87 −23 test-data/biosamples-original-isa.json
+3 −3 test-data/ena-receipt-invalid.json
+2 −2 test-data/ena-receipt.json
+2 −2 test-data/mars-ena-receipt-invalid.json
+8 −1 test-data/mars-ena-receipt.json
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -404,5 +404,3 @@ python mars_cli.py --credential-service-name metabolights --username-credential

[To set up and run the MARS tool locally using Docker, follow these steps](../repository-services/README.md)



46 changes: 25 additions & 21 deletions mars_lib/isa_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,29 +484,26 @@ def update_isa_json(isa_json: IsaJson, repo_response: RepositoryResponse) -> Isa

add_accession_to_data_file_node(updated_node, accession.value)
else:
# Add study accession to study comments
updated_study = apply_filter(study_filter, investigation.studies)

study_accession_comment: Comment = Comment(
accession_comment = Comment(
name=f"{target_repository}_{target_level}_accession",
value=accession.value,
)
updated_study.comments.append(study_accession_comment)

# Add study accession to assay comments
updated_assay = next(
filter(
lambda assay: is_assay_for_target_repo(assay, target_repository),
updated_study.assays,
),
None,
)
if updated_assay:
assay_accession_comment: Comment = Comment(
name=f"{target_repository}_{target_level}_accession",
value=accession.value,

if target_level == "study":
updated_study.comments.append(accession_comment)
else:
updated_assay = next(
filter(
lambda assay: is_assay_for_target_repo(
assay, target_repository
),
updated_study.assays,
),
None,
)
updated_assay.comments.append(assay_accession_comment)
if updated_assay:
updated_assay.comments.append(accession_comment)
isa_json.investigation = investigation
return isa_json

Expand All @@ -525,10 +522,13 @@ def map_data_files_to_repositories(
for assay in assays:
target_repo_comment: Comment = detect_target_repo_comment(assay.comments)
# This is an effect of everything being optional in the Comment model.
# Should we decide to make the value mandatory, this guard clause would not be necessary anymore.
# Should we decide to make the value mandatory, this guard clause
# would not be necessary anymore.
if target_repo_comment.value is None:
raise ValueError(
f"At least one assay in the ISA-JSON has no '{TARGET_REPO_KEY}' comment. Mapping not possible. Make sure all assays in the ISA-JSON have this comment!"
f"At least one assay in the ISA-JSON has no "
f"'{TARGET_REPO_KEY}' comment. Mapping not possible. "
f"Make sure all assays in the ISA-JSON have this comment!"
)
assay_data_files = [df.name for df in assay.dataFiles]

Expand All @@ -555,7 +555,11 @@ def map_data_files_to_repositories(

[
print_and_log(
msg=f"File '{rf['short_name']}' could not be mapped to any data file in the ISA-JSON. For this reason, it will be skipped during submission!",
msg=(
f"File '{rf['short_name']}' could not be mapped to any data "
f"file in the ISA-JSON. For this reason, it will be skipped "
f"during submission!"
),
level="warning",
)
for rf in remaining_files
Expand Down
118 changes: 58 additions & 60 deletions scripts/isa_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,34 +40,33 @@ def _md5_of_file(path: Path) -> str:
return h.hexdigest()


def _get_first_assay(isa_obj: dict[str, Any]) -> dict[str, Any] | None:
def _get_all_assays(isa_obj: dict[str, Any]) -> List[dict[str, Any]]:
"""
Navigate to investigation.studies[0].assays[0] (if present).
Return all assays found under investigation.studies[*].assays[*].
"""
inv = isa_obj.get("investigation")
if inv is None:
inv = isa_obj

if not isinstance(inv, dict):
return None
return []

studies = inv.get("studies") or []
if not isinstance(studies, list) or not studies:
return None
if not isinstance(studies, list):
return []

first_study = studies[0]
if not isinstance(first_study, dict):
return None
assays: List[dict[str, Any]] = []
for study in studies:
if not isinstance(study, dict):
continue

assays = first_study.get("assays") or []
if not isinstance(assays, list) or not assays:
return None
study_assays = study.get("assays") or []
if not isinstance(study_assays, list):
continue

first_assay = assays[0]
if not isinstance(first_assay, dict):
return None
assays.extend(assay for assay in study_assays if isinstance(assay, dict))

return first_assay
return assays


def _ensure_comment(comments: List[dict[str, Any]], name: str, value: str) -> None:
Expand All @@ -83,22 +82,22 @@ def _ensure_comment(comments: List[dict[str, Any]], name: str, value: str) -> No


def _update_datafiles_with_generated_files(
assay: dict[str, Any],
assays: List[dict[str, Any]],
data_dir: Path,
n_files: int,
n_files: int | None,
) -> List[Path]:
"""
For the first assay, update its dataFiles entries with newly generated .fastq.gz files.
Update assay dataFiles entries with newly generated .fastq.gz files.

Behaviour per dataFiles[i] (for i < n_files):
Behaviour per touched data file:

- Generate a unique .fastq.gz file based on the existing 'name':
e.g. ENA_TEST2.R2.fastq.gz -> ENA_TEST2.R2_<suffix>.fastq.gz
(if name doesn't end with .fastq.gz, just append _<suffix>.fastq.gz)

- Write a dummy FASTQ into that file and compute its MD5.

- Update the dataFiles[i] object:
- Update the dataFiles entry:
* "name" = new file name
* in "comments":
- "file name" -> new file name
Expand All @@ -107,66 +106,65 @@ def _update_datafiles_with_generated_files(
- "checksum_method" -> "MD5"
(existing "accession", "submission date", etc. are kept as-is)
"""
data_files_json = assay.get("dataFiles") or []
if not isinstance(data_files_json, list):
return []

generated_paths: List[Path] = []
suffix = _timestamp_suffix()
updated_count = 0

# We only touch up to n_files entries, and only those that look like objects
for i, df_json in enumerate(data_files_json):
if i >= n_files:
break
if not isinstance(df_json, dict):
for assay in assays:
data_files_json = assay.get("dataFiles") or []
if not isinstance(data_files_json, list):
continue

original_name = df_json.get("name")
if not isinstance(original_name, str) or not original_name:
continue
for df_json in data_files_json:
if n_files is not None and updated_count >= n_files:
return generated_paths
if not isinstance(df_json, dict):
continue

original_name = df_json.get("name")
if not isinstance(original_name, str) or not original_name:
continue

# Build unique .fastq.gz name
if original_name.endswith(".fastq.gz"):
base = original_name[:-len(".fastq.gz")]
new_name = f"{base}_{suffix}.fastq.gz"
else:
new_name = f"{original_name}_{suffix}.fastq.gz"
if original_name.endswith(".fastq.gz"):
base = original_name[:-len(".fastq.gz")]
new_name = f"{base}_{suffix}.fastq.gz"
else:
new_name = f"{original_name}_{suffix}.fastq.gz"

file_path = data_dir / new_name
_write_dummy_fastq_gz(file_path)
md5 = _md5_of_file(file_path)
file_path = data_dir / new_name
_write_dummy_fastq_gz(file_path)
md5 = _md5_of_file(file_path)

# Update the JSON entry
df_json["name"] = new_name
df_json["name"] = new_name

comments = df_json.get("comments")
if not isinstance(comments, list):
comments = []
df_json["comments"] = comments
comments = df_json.get("comments")
if not isinstance(comments, list):
comments = []
df_json["comments"] = comments

_ensure_comment(comments, "file name", new_name)
_ensure_comment(comments, "file type", "fastq")
_ensure_comment(comments, "file checksum", md5)
_ensure_comment(comments, "checksum_method", "MD5")
# DO NOT touch 'accession' or 'submission date' if present
_ensure_comment(comments, "file name", new_name)
_ensure_comment(comments, "file type", "fastq")
_ensure_comment(comments, "file checksum", md5)
_ensure_comment(comments, "checksum_method", "MD5")

generated_paths.append(file_path)
generated_paths.append(file_path)
updated_count += 1

return generated_paths


def generate_isa_json_with_data(
work_dir: Path,
template_path: Path,
n_files: int = 2,
n_files: int | None = None,
) -> Tuple[Path, List[Path]]:
"""
PoC behaviour:

1. Load ISA-JSON template from template_path.
2. Find investigation.studies[0].assays[0].dataFiles.
3. For up to n_files entries in dataFiles, generate UNIQUE .fastq.gz files
and update:
2. Find all investigation.studies[*].assays[*].dataFiles.
3. For each data file (or the first n_files when limited), generate UNIQUE
.fastq.gz files and update:
- dataFiles[i]["name"]
- dataFiles[i]["comments"] entries for file name, type, checksum, method.
4. Write the resulting ISA-JSON to work_dir / 'isa.json'.
Expand All @@ -177,12 +175,12 @@ def generate_isa_json_with_data(

isa_obj = json.loads(template_path.read_text())

assay = _get_first_assay(isa_obj)
assays = _get_all_assays(isa_obj)
generated_paths: List[Path] = []
if assay is not None:
if assays:
data_dir = work_dir / "data"
generated_paths = _update_datafiles_with_generated_files(
assay=assay,
assays=assays,
data_dir=data_dir,
n_files=n_files,
)
Expand Down
1 change: 0 additions & 1 deletion scripts/prepare_poc_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ def main() -> None:
isa_path, data_files = generate_isa_json_with_data(
work_dir=work_dir,
template_path=isa_template,
n_files=2,
)

cred_path = write_credentials_json(work_dir)
Expand Down
4 changes: 2 additions & 2 deletions tests/fixtures/mars_receipts/biosamples_success_response.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"key": "studies",
"where": {
"key": "title",
"value": "Arabidopsis thaliana"
"value": "Integrated multi-omics profiling of Arabidopsis thaliana under controlled experimental conditions"
}
},
{
Expand All @@ -38,7 +38,7 @@
"key": "studies",
"where": {
"key": "title",
"value": "Arabidopsis thaliana"
"value": "Integrated multi-omics profiling of Arabidopsis thaliana under controlled experimental conditions"
}
},
{
Expand Down
15 changes: 9 additions & 6 deletions tests/test_isa_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@ def test_load_isa_json():
# Should test the validation process of the ISA JSON file where root has 'investigation' as key.
valid_isa_json02 = load_isa_json("MARS/test-data/biosamples-input-isa.json", False)
assert len(valid_isa_json02.investigation.studies) == 1
assert valid_isa_json02.investigation.studies[0].title == "Arabidopsis thaliana"
assert (
valid_isa_json02.investigation.studies[0].title
== "Integrated multi-omics profiling of Arabidopsis thaliana under controlled experimental conditions"
)

with pytest.raises(ValidationError):
load_isa_json("./tests/fixtures/invalid_investigation.json", True)
Expand Down Expand Up @@ -245,7 +248,7 @@ def test_update_study_materials_with_accession_categories():
)


def test_update_study_and_assay_with_ena_study_accession_comment():
def test_update_study_only_with_ena_study_accession_comment():
json_path = "tests/fixtures/isa_jsons/1_after_biosamples.json"
isa_json = load_isa_json(json_path, False)
response_file_path = "tests/fixtures/mars_receipts/ena_success_response.json"
Expand All @@ -267,10 +270,10 @@ def test_update_study_and_assay_with_ena_study_accession_comment():
None,
)
assay_comments = ena_assay.comments
accession_comment = filter(
lambda x: x.name == "ena_study_accession", assay_comments
)
assert next(accession_comment).value == ena_study_accession_number
accession_comments = [
comment for comment in assay_comments if comment.name == "ena_study_accession"
]
assert accession_comments == []


def test_update_datafile_comment_with_accession_comment_present():
Expand Down
Loading