From 3ece726b1b188abddddf918db64ab5b4ece78d10 Mon Sep 17 00:00:00 2001 From: Alena Rybakina Date: Sat, 13 Jun 2026 15:46:16 +0300 Subject: [PATCH] orca: fall back for replicated CTE consumed in correlated scalar subqueries Root cause is a blind spot in the #375 slice walker: CollectCTESlices delimits slices only at Motion nodes. A CTE over a DISTRIBUTED REPLICATED table referenced from correlated scalar subqueries is decorrelated by ORCA into CPhysicalCorrelated*NLJoin whose inner side becomes an executor SubPlan running in its own slice -- but there is no Motion at that boundary, so the walker placed the Consumer on the same slice as the Producer. The cross-slice check (prod->sliceId != cons->sliceId) never fired, no fallback happened, and the ShareInputScan writer hung forever in shareinput_writer_waitdone() waiting for DONE acks from reader slices that never run. Teach the walker that the inner (subquery) side of a correlated NL join is a slice boundary too, mirroring the Motion rule. The replicated Consumer in the SubPlan then gets a distinct slice id, the existing check fires, and ORCA falls back to the Postgres optimizer. --- .../gporca/libgpopt/src/base/CUtils.cpp | 33 ++++++++++++++++--- src/test/regress/expected/shared_scan.out | 32 ++++++++++++++++++ .../expected/shared_scan_optimizer.out | 32 ++++++++++++++++++ src/test/regress/sql/shared_scan.sql | 29 ++++++++++++++++ 4 files changed, 122 insertions(+), 4 deletions(-) diff --git a/src/backend/gporca/libgpopt/src/base/CUtils.cpp b/src/backend/gporca/libgpopt/src/base/CUtils.cpp index d114a639449..66fa9c1db56 100644 --- a/src/backend/gporca/libgpopt/src/base/CUtils.cpp +++ b/src/backend/gporca/libgpopt/src/base/CUtils.cpp @@ -922,10 +922,30 @@ struct SCTEInfo typedef CDynamicPtrArray > CTEInfoArray; +// True if the operator is a correlated NL join. Its inner side becomes +// an executor SubPlan that runs in its own slice, so a CTE Consumer +// there is cross-slice w.r.t. a Producer outside -- which can deadlock +// the ShareInputScan writer. We treat the inner side as a slice +// boundary so the check below catches it. +// +// These are all of ORCA's SubPlan-producing operators. Add new ones here. +static BOOL +FCorrelatedNLJoin(COperator *pop) +{ + COperator::EOperatorId eopid = pop->Eopid(); + return (COperator::EopPhysicalCorrelatedInnerNLJoin == eopid || + COperator::EopPhysicalCorrelatedLeftOuterNLJoin == eopid || + COperator::EopPhysicalCorrelatedLeftSemiNLJoin == eopid || + COperator::EopPhysicalCorrelatedInLeftSemiNLJoin == eopid || + COperator::EopPhysicalCorrelatedLeftAntiSemiNLJoin == eopid || + COperator::EopPhysicalCorrelatedNotInLeftAntiSemiNLJoin == eopid); +} + // Walk the physical tree, recording the slice id of every replicated // CTE Producer and every CTE Consumer. Slices are delimited by Motion -// nodes: each non-scalar child of a Motion lives in a fresh slice -- -// same motId-stack idea as in apply_shareinput_xslice. +// nodes and by the SubPlan (inner) side of correlated NL joins: each +// such non-scalar child lives in a fresh slice -- same motId-stack idea +// as in apply_shareinput_xslice. static void CollectCTESlices(CMemoryPool *mp, CExpression *pexpr, ULONG curSlice, ULONG *pNextSlice, CTEInfoArray *prodInfos, @@ -958,6 +978,7 @@ CollectCTESlices(CMemoryPool *mp, CExpression *pexpr, ULONG curSlice, } BOOL isMotion = CUtils::FPhysicalMotion(pop); + BOOL isCorrelatedNLJoin = FCorrelatedNLJoin(pop); for (ULONG ul = 0; ul < pexpr->Arity(); ul++) { @@ -971,9 +992,13 @@ CollectCTESlices(CMemoryPool *mp, CExpression *pexpr, ULONG curSlice, } // Allocate a fresh slice id for each non-scalar child of a - // Motion; otherwise the child stays in the parent's slice. + // Motion, and for the inner (subquery) side of a correlated NL + // join -- which the executor materializes as a SubPlan running + // in its own slice. Otherwise the child stays in the parent's + // slice. (For a NL join child 0 is the outer relation and child + // 1 is the inner/subquery relation.) ULONG childSlice = curSlice; - if (isMotion) + if (isMotion || (isCorrelatedNLJoin && 1 == ul)) { (*pNextSlice)++; childSlice = *pNextSlice; diff --git a/src/test/regress/expected/shared_scan.out b/src/test/regress/expected/shared_scan.out index 1bb16e8a465..75d6f6413a6 100644 --- a/src/test/regress/expected/shared_scan.out +++ b/src/test/regress/expected/shared_scan.out @@ -278,6 +278,38 @@ WITH RESET statement_timeout; DROP TABLE ss_t1, ss_t2; +-- ORCA should also fall back when the replicated CTE is referenced from +-- *correlated* scalar subqueries. These become correlated NL joins whose +-- inner side runs as a SubPlan in its own slice. Counting only Motion +-- nodes misses this, so the ShareInputScan writer hangs waiting for a +-- DONE ack from the reader slice. The walk must treat the correlated-join +-- inner side as a separate slice. +CREATE TABLE ss_c2 (id numeric, refrcode varchar(255), referenceid numeric) + DISTRIBUTED REPLICATED; +CREATE TABLE ss_c1 (id bigint, iscalctrg varchar(15) NOT NULL, + iscalcdetail varchar(15)) + DISTRIBUTED REPLICATED; +INSERT INTO ss_c2 SELECT i, 'A'||(i%5), 101991 FROM generate_series(1, 50000) i; +INSERT INTO ss_c1 + SELECT i, 'A'||(i%5), 'A'||(i%7) FROM generate_series(1, 50000) i; +ANALYZE ss_c1; +ANALYZE ss_c2; +SET statement_timeout = '15s'; +WITH cte AS ( + SELECT id, refrcode FROM ss_c2 WHERE referenceid = 101991 AND id < 25000 + UNION ALL + SELECT id, refrcode FROM ss_c2 WHERE referenceid = 101991 AND id >= 25000 +) + SELECT (SELECT refrcode FROM cte WHERE refrcode = p.iscalctrg LIMIT 1) = 'A1' + AND (SELECT refrcode FROM cte WHERE refrcode = p.iscalcdetail LIMIT 1) = 'A1' AS ok + FROM ss_c1 p WHERE p.id = 1; + ok +---- + t +(1 row) + +RESET statement_timeout; +DROP TABLE ss_c1, ss_c2; -- Test the scenario which already opened many fds -- start_ignore RESET search_path; diff --git a/src/test/regress/expected/shared_scan_optimizer.out b/src/test/regress/expected/shared_scan_optimizer.out index 56919a5fcb4..fbfb446a260 100644 --- a/src/test/regress/expected/shared_scan_optimizer.out +++ b/src/test/regress/expected/shared_scan_optimizer.out @@ -291,6 +291,38 @@ WITH RESET statement_timeout; DROP TABLE ss_t1, ss_t2; +-- ORCA should also fall back when the replicated CTE is referenced from +-- *correlated* scalar subqueries. These become correlated NL joins whose +-- inner side runs as a SubPlan in its own slice. Counting only Motion +-- nodes misses this, so the ShareInputScan writer hangs waiting for a +-- DONE ack from the reader slice. The walk must treat the correlated-join +-- inner side as a separate slice. +CREATE TABLE ss_c2 (id numeric, refrcode varchar(255), referenceid numeric) + DISTRIBUTED REPLICATED; +CREATE TABLE ss_c1 (id bigint, iscalctrg varchar(15) NOT NULL, + iscalcdetail varchar(15)) + DISTRIBUTED REPLICATED; +INSERT INTO ss_c2 SELECT i, 'A'||(i%5), 101991 FROM generate_series(1, 50000) i; +INSERT INTO ss_c1 + SELECT i, 'A'||(i%5), 'A'||(i%7) FROM generate_series(1, 50000) i; +ANALYZE ss_c1; +ANALYZE ss_c2; +SET statement_timeout = '15s'; +WITH cte AS ( + SELECT id, refrcode FROM ss_c2 WHERE referenceid = 101991 AND id < 25000 + UNION ALL + SELECT id, refrcode FROM ss_c2 WHERE referenceid = 101991 AND id >= 25000 +) + SELECT (SELECT refrcode FROM cte WHERE refrcode = p.iscalctrg LIMIT 1) = 'A1' + AND (SELECT refrcode FROM cte WHERE refrcode = p.iscalcdetail LIMIT 1) = 'A1' AS ok + FROM ss_c1 p WHERE p.id = 1; + ok +---- + t +(1 row) + +RESET statement_timeout; +DROP TABLE ss_c1, ss_c2; -- Test the scenario which already opened many fds -- start_ignore RESET search_path; diff --git a/src/test/regress/sql/shared_scan.sql b/src/test/regress/sql/shared_scan.sql index df2d21faf2d..e8b1223b3dd 100644 --- a/src/test/regress/sql/shared_scan.sql +++ b/src/test/regress/sql/shared_scan.sql @@ -150,6 +150,35 @@ WITH RESET statement_timeout; DROP TABLE ss_t1, ss_t2; +-- ORCA should also fall back when the replicated CTE is referenced from +-- *correlated* scalar subqueries. These become correlated NL joins whose +-- inner side runs as a SubPlan in its own slice. Counting only Motion +-- nodes misses this, so the ShareInputScan writer hangs waiting for a +-- DONE ack from the reader slice. The walk must treat the correlated-join +-- inner side as a separate slice. +CREATE TABLE ss_c2 (id numeric, refrcode varchar(255), referenceid numeric) + DISTRIBUTED REPLICATED; +CREATE TABLE ss_c1 (id bigint, iscalctrg varchar(15) NOT NULL, + iscalcdetail varchar(15)) + DISTRIBUTED REPLICATED; +INSERT INTO ss_c2 SELECT i, 'A'||(i%5), 101991 FROM generate_series(1, 50000) i; +INSERT INTO ss_c1 + SELECT i, 'A'||(i%5), 'A'||(i%7) FROM generate_series(1, 50000) i; +ANALYZE ss_c1; +ANALYZE ss_c2; + +SET statement_timeout = '15s'; +WITH cte AS ( + SELECT id, refrcode FROM ss_c2 WHERE referenceid = 101991 AND id < 25000 + UNION ALL + SELECT id, refrcode FROM ss_c2 WHERE referenceid = 101991 AND id >= 25000 +) + SELECT (SELECT refrcode FROM cte WHERE refrcode = p.iscalctrg LIMIT 1) = 'A1' + AND (SELECT refrcode FROM cte WHERE refrcode = p.iscalcdetail LIMIT 1) = 'A1' AS ok + FROM ss_c1 p WHERE p.id = 1; +RESET statement_timeout; +DROP TABLE ss_c1, ss_c2; + -- Test the scenario which already opened many fds -- start_ignore RESET search_path;