Draft
84 commits
5861da9
Support streaming over expression
Matt711 Apr 14, 2026
66be8e9
Add fast path for scalar aggs case, support preserving the full oreder
Matt711 Apr 16, 2026
3b15601
add more tests, protect against OOMs when preserving the full order
Matt711 Apr 17, 2026
6b91f1b
check style
Matt711 Apr 17, 2026
ac73ede
add pre-shuffle test, and clarifying comments
Matt711 Apr 17, 2026
0c8e3de
add multi-rank test
Matt711 Apr 17, 2026
696bbc9
style
Matt711 Apr 17, 2026
fdc40c9
Merge branch 'main' into fea/polars/streaming-over
Matt711 Apr 17, 2026
e477439
reject non decomposable aggregations
Matt711 Apr 17, 2026
4a6eb6d
fix xfail condition for mg test
Matt711 Apr 17, 2026
0c48e1d
Merge branch 'main' into fea/polars/streaming-over
Matt711 Apr 17, 2026
0d6d6b1
Merge branch 'main' into fea/polars/streaming-over
Matt711 Apr 17, 2026
01d8777
Merge branch 'main' into fea/polars/streaming-over
Matt711 Apr 17, 2026
f1cf13c
Add Over IR node
Matt711 Apr 21, 2026
4ac5e65
decompose select with over expressions with mixed partition by keys
Matt711 Apr 21, 2026
c02eea9
move IR to type checking block
Matt711 Apr 21, 2026
edc704e
add Over IR node
Matt711 Apr 21, 2026
5d28fb9
merge conflict
Matt711 Apr 22, 2026
4fc4d25
merge conflict
Matt711 May 1, 2026
e1442f2
Merge branch 'main' of https://github.com/rapidsai/cudf into fea/pola…
Matt711 May 2, 2026
d632dd4
add _decompose_grouped_window_node
rjzamora May 4, 2026
a7a914a
Merge remote-tracking branch 'upstream/main' into streaming-over-rick
rjzamora May 4, 2026
6bee1b8
cull unnecessary code
rjzamora May 4, 2026
4ca3e3c
address review
Matt711 May 4, 2026
d690b1e
heavy revisions
rjzamora May 4, 2026
9dab468
Merge remote-tracking branch 'upstream/main' into fea/polars/streamin…
rjzamora May 4, 2026
f8db130
Merge branch 'streaming-over-rick' into fea/polars/streaming-over
rjzamora May 4, 2026
3893253
Merge branch 'main' into fea/polars/streaming-over
rjzamora May 4, 2026
ca1e808
fix non-scalar over() by absorbing passthrough cols into Over node
Matt711 May 4, 2026
ec93d63
fix non-scalar over() with duplicated=True input
Matt711 May 5, 2026
dc28164
Fix assertion failures in assert_tpch_result_equal due to float sort …
Matt711 May 5, 2026
c270bb1
remove comment
Matt711 May 5, 2026
f275788
update decompose condition for dynamic planning
Matt711 May 5, 2026
6051ac5
Fix scalar-over path: update _DECOMPOSABLE_AGG_NAMES, assert no-presh…
Matt711 May 5, 2026
b272f6c
Fix over_actor: remove stale key assertion, fix shuffle modulus to us…
Matt711 May 5, 2026
4d1b482
Add Over.do_evaluate and remove eval_ir indirection in over_actor
Matt711 May 5, 2026
85f77fd
Extract _allgather_and_broadcast and _shuffle_and_reassemble from ove…
Matt711 May 5, 2026
4c4a33f
Fix shuffle modulus: AllGather total size/count and compute from targ…
Matt711 May 5, 2026
563bac7
restoring _fuse_over_nodes
Matt711 May 6, 2026
dad1a79
merge conflict
Matt711 May 6, 2026
30e8826
update fixture name
Matt711 May 6, 2026
d86d0d8
More Polars plan optimizations for TPC-DS
Matt711 May 6, 2026
cb94dd4
Merge branch 'main' into imp/pdsds/more-pds-optimizations
Matt711 May 6, 2026
f9fdd46
add a second (reverse) shuffle for mg correctness
Matt711 May 6, 2026
c6a7b10
estimate the modulus
Matt711 May 6, 2026
3051291
remove xfail marker from test
Matt711 May 6, 2026
bd076c3
simplify skip condition in spmd over test
Matt711 May 6, 2026
12620c8
set allow_subset=True
Matt711 May 6, 2026
5f4dd3e
remove input_ir arg
Matt711 May 6, 2026
09a4586
simplify conditional statement
Matt711 May 6, 2026
ccaee76
TODO tracer API
Matt711 May 6, 2026
bc584d7
rename split* instead of boundaries
Matt711 May 6, 2026
a723dc8
use int32
Matt711 May 6, 2026
902a4cd
clean up docstring OriginStamps docstring
Matt711 May 6, 2026
338757a
Merge branch 'main' into fea/polars/streaming-over
Matt711 May 6, 2026
0a94101
fix upstream polars tests
Matt711 May 7, 2026
370fc21
Merge branch 'main' into fea/polars/streaming-over
Matt711 May 7, 2026
5b25eea
Merge branch 'main' into fea/polars/streaming-over
Matt711 May 7, 2026
7c81adf
address driveby nits
Matt711 May 7, 2026
68a7983
drop id() as dict key
Matt711 May 7, 2026
7975ff4
simplify _evaluate_window_with_stamps
Matt711 May 7, 2026
a6f397c
address more nits
Matt711 May 7, 2026
ea39bd7
docstrings & use names_to_indices
Matt711 May 7, 2026
658e4bf
simplify scalar-Over IR + cleanup
Matt711 May 7, 2026
d3b6562
colleect ir rewrite for scalar path ahead of time, comment improvemen…
Matt711 May 7, 2026
7b828be
add module doc string overviewing the algorithm, few smaller clean ups
Matt711 May 7, 2026
397d50f
more clean ups
Matt711 May 7, 2026
946582a
Merge branch 'main' into fea/polars/streaming-over
Matt711 May 7, 2026
f7687f1
oh yeah, dont use to_arrow
Matt711 May 7, 2026
beee7d9
merge conflict
Matt711 May 11, 2026
2b8b357
Merge branch 'main' of https://github.com/rapidsai/cudf into fea/pola…
Matt711 May 11, 2026
fc9f7bb
no allgather unecessarily
Matt711 May 11, 2026
a965b27
Merge PR #22191
quasiben May 11, 2026
64fe9f3
Merge PR #22378
quasiben May 11, 2026
ac4c9d1
Merge PR #22395
quasiben May 11, 2026
b1e57de
Merge upstream main into pds-ds-all
quasiben May 12, 2026
3a23609
update NormalizedPartitioning.from_keys call site
Matt711 May 12, 2026
c671992
Revert "remove comment"
TomAugspurger May 12, 2026
3c7fda9
Revert "Fix assertion failures in assert_tpch_result_equal due to flo…
TomAugspurger May 12, 2026
65827dc
Update floating-point handling
TomAugspurger May 12, 2026
9c66981
Merge branch 'main' into bug/pdsds/q64-validation
TomAugspurger May 12, 2026
6f29e67
merge conflict
Matt711 May 12, 2026
de156af
Merge PR #22191 updates
quasiben May 12, 2026
4f69818
Merge PR #22378 updates
quasiben May 12, 2026
64 changes: 49 additions & 15 deletions python/cudf_polars/cudf_polars/experimental/benchmarks/asserts.py
@@ -247,15 +247,33 @@ def assert_tpch_result_equal(

# We know that each dataframe is sorted on `sort_by` according to itself.
# Now we have some freedom to reorder the rows. We'll use this freedom to avoid
# any kind of sorting on floating-point columns, which introduces all sorts of
# fuzziness we don't want to deal with.
# any kind of fuzziness from sorting on floating-point columns.
#
# As long as we sort by the non-float columns *first*, we'll avoid any
# false positives / false negatives from comparing two tables that have the
# same values but happen to be in a different order. Sorting by floating-point
# columns *last* ensures that records that are close to each other appear in
# (roughly) the same order, such that Polars' approximate equality checks
# will allow them to be considered equal (or not, if they aren't actually close).
non_float_columns = [
col
for col in left.columns
if left.schema[col] not in (pl.Float32, pl.Float64)
]
left_sorted = left.sort(by=non_float_columns, nulls_last=nulls_last)
right_sorted = right.sort(by=non_float_columns, nulls_last=nulls_last)
float_columns = [
col for col in left.columns if left.schema[col] in (pl.Float32, pl.Float64)
]
grouped_sort_columns = [*non_float_columns, *float_columns]

def sort_for_comparison(df: pl.DataFrame) -> pl.DataFrame:
return (
df.sort(by=grouped_sort_columns, nulls_last=nulls_last)
if grouped_sort_columns
else df
)

left_sorted = sort_for_comparison(left)
right_sorted = sort_for_comparison(right)
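The grouped-sort idea above can be illustrated with a minimal stdlib sketch (hypothetical helper and data, plain Python rather than polars): ordering rows by the exact-typed columns first and the floating-point columns last makes two tables line up row-for-row even when their float values differ only by rounding noise.

```python
# Hypothetical stand-in for the diff's sort_for_comparison: sort rows by
# non-float key columns first, float columns last, so near-equal floats
# cannot scramble the row order between the two tables being compared.
def sort_for_comparison(rows, non_float_idx, float_idx):
    """Order rows by non-float columns first, float columns last."""
    return sorted(rows, key=lambda r: tuple(r[i] for i in (*non_float_idx, *float_idx)))

left = [("b", 2, 0.30000000000000004), ("a", 1, 0.1)]
right = [("a", 1, 0.1 + 1e-12), ("b", 2, 0.3)]

ls = sort_for_comparison(left, non_float_idx=(0, 1), float_idx=(2,))
rs = sort_for_comparison(right, non_float_idx=(0, 1), float_idx=(2,))

# Rows now pair up by their exact keys; only the float column needs an
# approximate (tolerance-based) check, as assert_frame_equal performs.
assert all(l[:2] == r[:2] for l, r in zip(ls, rs))
assert all(abs(l[2] - r[2]) < 1e-9 for l, r in zip(ls, rs))
```

In the real code this role is played by `polars.testing.assert_frame_equal`'s tolerance parameters; the sketch only shows why float columns must sort last.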

if limit is None or left.is_empty():
try:
@@ -320,8 +338,8 @@ def assert_tpch_result_equal(

try:
polars.testing.assert_frame_equal(
result_first.sort(by=non_float_columns, nulls_last=nulls_last),
expected_first.sort(by=non_float_columns, nulls_last=nulls_last),
sort_for_comparison(result_first),
sort_for_comparison(expected_first),
**polars_kwargs, # type: ignore[arg-type]
)
except AssertionError as e:
@@ -339,12 +357,8 @@

try:
polars.testing.assert_frame_equal(
result_ties.sort(non_float_columns, nulls_last=nulls_last).select(
by
),
expected_ties.sort(non_float_columns, nulls_last=nulls_last).select(
by
),
sort_for_comparison(result_ties).select(by),
sort_for_comparison(expected_ties).select(by),
**polars_kwargs, # type: ignore[arg-type]
)
except AssertionError as e:
@@ -354,11 +368,31 @@
) from e

else:
# no sort_by, just a straight comparison.
non_float_columns = [
col
for col in left.columns
if left.schema[col] not in (pl.Float32, pl.Float64)
]
float_columns = [
col for col in left.columns if left.schema[col] in (pl.Float32, pl.Float64)
]
grouped_sort_columns = [*non_float_columns, *float_columns]
left_sorted = (
left.sort(by=grouped_sort_columns, nulls_last=nulls_last)
if grouped_sort_columns
else left
)
right_sorted = (
right.sort(by=grouped_sort_columns, nulls_last=nulls_last)
if grouped_sort_columns
else right
)

# no sort_by, compare after grouped sort to ignore nondeterministic row order.
try:
polars.testing.assert_frame_equal(
left,
right,
left_sorted,
right_sorted,
**polars_kwargs, # type: ignore[arg-type]
Comment on lines 370 to 396
⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

check_row_order is neutralized when sort_by is empty

This branch always canonicalizes row order before comparison, so check_row_order=True no longer enforces original order and can hide ordering regressions.

Suggested fix
 else:
-    non_float_columns = [
-        col
-        for col in left.columns
-        if left.schema[col] not in (pl.Float32, pl.Float64)
-    ]
-    float_columns = [
-        col for col in left.columns if left.schema[col] in (pl.Float32, pl.Float64)
-    ]
-    grouped_sort_columns = [*non_float_columns, *float_columns]
-    left_sorted = (
-        left.sort(by=grouped_sort_columns, nulls_last=nulls_last)
-        if grouped_sort_columns
-        else left
-    )
-    right_sorted = (
-        right.sort(by=grouped_sort_columns, nulls_last=nulls_last)
-        if grouped_sort_columns
-        else right
-    )
+    if check_row_order:
+        left_sorted = left
+        right_sorted = right
+    else:
+        non_float_columns = [
+            col
+            for col in left.columns
+            if left.schema[col] not in (pl.Float32, pl.Float64)
+        ]
+        float_columns = [
+            col for col in left.columns if left.schema[col] in (pl.Float32, pl.Float64)
+        ]
+        grouped_sort_columns = [*non_float_columns, *float_columns]
+        left_sorted = (
+            left.sort(by=grouped_sort_columns, nulls_last=nulls_last)
+            if grouped_sort_columns
+            else left
+        )
+        right_sorted = (
+            right.sort(by=grouped_sort_columns, nulls_last=nulls_last)
+            if grouped_sort_columns
+            else right
+        )
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@python/cudf_polars/cudf_polars/experimental/benchmarks/asserts.py` around
lines 370 - 396, The current else-branch always reorders rows via
grouped_sort_columns before comparing, which neutralizes check_row_order; update
the logic around left_sorted/right_sorted so that when check_row_order is True
you do NOT sort (keep left/right as-is) and only perform the grouped_sort when
check_row_order is False (preserving existing behavior when sort_by is empty);
adjust the variables used in the call to polars.testing.assert_frame_equal
(left_sorted/right_sorted) accordingly so the comparison respects the
check_row_order flag.
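The reviewer's point reduces to a small sketch (hypothetical names, not the real `assert_tpch_result_equal`): canonicalize row order only when the caller has not asked for row order to be verified.

```python
# Hypothetical comparison helper: sort both sides only when row order is
# NOT significant; otherwise compare rows exactly as given, so a reordering
# regression still fails the check.
def frames_match(left, right, check_row_order=False):
    """Compare two row lists; canonicalize order only if order is insignificant."""
    if not check_row_order:
        left, right = sorted(left), sorted(right)
    return left == right

a = [(1, "x"), (2, "y")]
b = [(2, "y"), (1, "x")]

assert frames_match(a, b, check_row_order=False)     # same rows, any order: equal
assert not frames_match(a, b, check_row_order=True)  # order differs: mismatch
```

The suggested fix above applies the same guard: `left_sorted`/`right_sorted` fall back to the unsorted frames whenever `check_row_order` is true.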

)
except AssertionError as e:
@@ -335,65 +335,66 @@ def polars_impl(run_config: RunConfig) -> QueryResult:
item = get_data(run_config.dataset_path, "item", run_config.suffix)
date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)

cross_items = build_cross_items(
store_sales, catalog_sales, web_sales, item, date_dim, year=year
)
average_sales = build_average_sales(
store_sales, catalog_sales, web_sales, date_dim, year=year
)

# week_dates is ≤7 rows (one calendar week), computed once as a 1-partition frame.
# Push the week filter into each channel before the UNION via a semi-join so that
# ~99% of rows (everything outside the target week) are dropped before the
# expensive cross_items join and groupby.
target_week = (
date_dim.filter(
(pl.col("d_year") == year + 1)
& (pl.col("d_moy") == 12)
& (pl.col("d_dom") == day)
)
.select("d_week_seq")
.unique()
)
week_dates = date_dim.join(target_week, on="d_week_seq").select("d_date_sk")

all_sales = pl.concat(
[
store_sales.select(
store_sales.join(
week_dates, left_on="ss_sold_date_sk", right_on="d_date_sk", how="semi"
).select(
[
pl.lit("store").alias("channel"),
pl.col("ss_item_sk").alias("item_sk"),
pl.col("ss_quantity").alias("quantity"),
pl.col("ss_list_price").alias("list_price"),
pl.col("ss_sold_date_sk").alias("date_sk"),
]
),
catalog_sales.select(
catalog_sales.join(
week_dates, left_on="cs_sold_date_sk", right_on="d_date_sk", how="semi"
).select(
[
pl.lit("catalog").alias("channel"),
pl.col("cs_item_sk").alias("item_sk"),
pl.col("cs_quantity").alias("quantity"),
pl.col("cs_list_price").alias("list_price"),
pl.col("cs_sold_date_sk").alias("date_sk"),
]
),
web_sales.select(
web_sales.join(
week_dates, left_on="ws_sold_date_sk", right_on="d_date_sk", how="semi"
).select(
[
pl.lit("web").alias("channel"),
pl.col("ws_item_sk").alias("item_sk"),
pl.col("ws_quantity").alias("quantity"),
pl.col("ws_list_price").alias("list_price"),
pl.col("ws_sold_date_sk").alias("date_sk"),
]
),
]
)
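The semi-join push-down applied to each channel above can be sketched with plain Python (hypothetical toy data, not the TPC-DS tables): a semi join keeps only left rows whose key appears in the small right frame, without taking any right-side columns, so the filter is cheap to apply before the expensive joins.

```python
# Stdlib sketch of how="semi" against the tiny week_dates frame: drop
# fact-table rows outside the target week before any expensive work.
week_dates = {20261201, 20261202}  # stand-in for the <=7 qualifying d_date_sk values

store_sales = [
    {"ss_sold_date_sk": 20261201, "ss_quantity": 5},
    {"ss_sold_date_sk": 20260615, "ss_quantity": 9},  # outside the target week
]

# Semi-join semantics: membership filter on the join key; no right columns kept.
filtered = [r for r in store_sales if r["ss_sold_date_sk"] in week_dates]

assert filtered == [{"ss_sold_date_sk": 20261201, "ss_quantity": 5}]
```

In the diff this is `store_sales.join(week_dates, ..., how="semi")`, repeated per channel so the union is built from already-pruned inputs.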

cross_items = build_cross_items(
store_sales, catalog_sales, web_sales, item, date_dim, year=year
)
average_sales = build_average_sales(
store_sales, catalog_sales, web_sales, date_dim, year=year
)

# d_week_seq target is the same for all 3 channels; compute it once.
target_week = (
date_dim.filter(
(pl.col("d_year") == year + 1)
& (pl.col("d_moy") == 12)
& (pl.col("d_dom") == day)
)
.select("d_week_seq")
.unique()
)
week_dates = date_dim.join(target_week, on="d_week_seq").select("d_date_sk")

# Build y: all 3 channels in a single pipeline.
# cross_items and average_sales each appear once — no CSE needed.
# After group_by the frame is tiny, so the cross join with the 1-row
# average_sales frame is negligible even if Polars fuses it into an IEJoin.
y = (
all_sales.join(cross_items, left_on="item_sk", right_on="ss_item_sk")
.join(item, left_on="item_sk", right_on="i_item_sk")
.join(week_dates, left_on="date_sk", right_on="d_date_sk")
.group_by(["channel", "i_brand_id", "i_class_id", "i_category_id"])
.agg(
[
@@ -110,38 +110,103 @@ def polars_impl(run_config: RunConfig) -> QueryResult:
sort_by = {"i_item_id": False, "i_item_desc": False, "s_state": False}
limit = 100

store_sales_base = (
q1 = f"{year}Q1"
q1_q3 = [f"{year}Q1", f"{year}Q2", f"{year}Q3"]

# Pre-filter date_dim to only qualifying d_date_sk values.
d1_dates = date_dim.filter(pl.col("d_quarter_name") == q1).select("d_date_sk")
d_q3_dates = date_dim.filter(pl.col("d_quarter_name").is_in(q1_q3)).select(
"d_date_sk"
)

# store_returns has [6] partitions — at the broadcast limit. Filter it to Q1-Q3 dates
# first, then use the (customer, item) pairs it contains to pre-filter both store_sales
# and catalog_sales before those larger tables enter the expensive shuffle joins.
store_returns_filtered = store_returns.join(
d_q3_dates, left_on="sr_returned_date_sk", right_on="d_date_sk", how="semi"
).select(["sr_customer_sk", "sr_item_sk", "sr_ticket_number", "sr_return_quantity"])

# (customer, item) pairs present in any qualifying store return; stays at [6] partitions
# so broadcast is free. Polars will CACHE this shared subplan.
sr_customer_item = store_returns_filtered.select(["sr_customer_sk", "sr_item_sk"])

store_sales_filtered = (
store_sales.join(
date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk", suffix="_d1"
d1_dates, left_on="ss_sold_date_sk", right_on="d_date_sk", how="semi"
)
.join(
sr_customer_item,
left_on=["ss_customer_sk", "ss_item_sk"],
right_on=["sr_customer_sk", "sr_item_sk"],
how="semi",
)
.select(
[
"ss_customer_sk",
"ss_item_sk",
"ss_store_sk",
"ss_ticket_number",
"ss_quantity",
]
)
.join(
item.select(["i_item_sk", "i_item_id", "i_item_desc"]),
left_on="ss_item_sk",
right_on="i_item_sk",
)
.join(
store.select(["s_store_sk", "s_state"]),
left_on="ss_store_sk",
right_on="s_store_sk",
)
.select(
[
"ss_customer_sk",
"ss_item_sk",
"ss_ticket_number",
"ss_quantity",
"i_item_id",
"i_item_desc",
"s_state",
]
)
.join(item, left_on="ss_item_sk", right_on="i_item_sk")
.join(store, left_on="ss_store_sk", right_on="s_store_sk")
.filter(pl.col("d_quarter_name") == f"{year}Q1")
)

store_returns_base = store_returns.join(
date_dim, left_on="sr_returned_date_sk", right_on="d_date_sk", suffix="_d2"
).filter(pl.col("d_quarter_name").is_in([f"{year}Q1", f"{year}Q2", f"{year}Q3"]))

catalog_sales_base = catalog_sales.join(
date_dim, left_on="cs_sold_date_sk", right_on="d_date_sk", suffix="_d3"
).filter(pl.col("d_quarter_name").is_in([f"{year}Q1", f"{year}Q2", f"{year}Q3"]))
catalog_sales_filtered = (
catalog_sales.join(
d_q3_dates, left_on="cs_sold_date_sk", right_on="d_date_sk", how="semi"
)
.join(
sr_customer_item,
left_on=["cs_bill_customer_sk", "cs_item_sk"],
right_on=["sr_customer_sk", "sr_item_sk"],
how="semi",
)
.select(["cs_bill_customer_sk", "cs_item_sk", "cs_quantity"])
)

return QueryResult(
frame=(
store_sales_base.join(
store_returns_base,
store_sales_filtered.join(
store_returns_filtered,
left_on=["ss_customer_sk", "ss_item_sk", "ss_ticket_number"],
right_on=["sr_customer_sk", "sr_item_sk", "sr_ticket_number"],
how="inner",
suffix="_sr",
)
.select(
[
"ss_customer_sk",
"ss_item_sk",
"ss_quantity",
"sr_return_quantity",
"i_item_id",
"i_item_desc",
"s_state",
]
)
.join(
catalog_sales_base,
catalog_sales_filtered,
left_on=["ss_customer_sk", "ss_item_sk"],
right_on=["cs_bill_customer_sk", "cs_item_sk"],
how="inner",
suffix="_cs",
)
.group_by(["i_item_id", "i_item_desc", "s_state"])
.agg(