From 80d7cef04c0843723c53f69692978820ae5db404 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 18:05:37 +0000 Subject: [PATCH] Fix missing value handling in LAION-400M dataset metadata extraction. Missing values from Pandas were being extracted as `NaN`, which evaluates as truthy, so the `or` fallback never replaced them and they failed validation during serialization. The extraction now checks with `pd.isna` instead, so missing values are correctly replaced with an empty string or the designated missing value placeholder. Co-authored-by: tomvdw <4160552+tomvdw@users.noreply.github.com> --- .../vision_language/laion400m/laion400m.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow_datasets/vision_language/laion400m/laion400m.py b/tensorflow_datasets/vision_language/laion400m/laion400m.py index 546702796f2..a5e32c9f271 100644 --- a/tensorflow_datasets/vision_language/laion400m/laion400m.py +++ b/tensorflow_datasets/vision_language/laion400m/laion400m.py @@ -78,15 +78,19 @@ def _get_example_metadata(metadata_df_row): """Returns example metadata.""" + pd = tfds.core.lazy_imports.pandas nsfw_tag = metadata_df_row['NSFW'] if nsfw_tag not in _NSFW_TAGS: nsfw_tag = _NSFW_MISSING_TAG + similarity = metadata_df_row['similarity'] + license_ = metadata_df_row['LICENSE'] + return { 'caption': metadata_df_row['caption'], 'nsfw': nsfw_tag, - 'similarity': metadata_df_row['similarity'] or _MISSING_SIMILARITY_VALUE, - 'license': metadata_df_row['LICENSE'] or '', + 'similarity': _MISSING_SIMILARITY_VALUE if pd.isna(similarity) else similarity, + 'license': '' if pd.isna(license_) else license_, 'url': metadata_df_row['url'], 'original_width': metadata_df_row['original_width'], 'original_height': metadata_df_row['original_height'],