Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions docs/release_notes/flare_280.rst
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,22 @@ Compatibility and Migration Notes
- The deprecated FLAdminAPI surface has been removed. Use the FLARE API,
Recipe environments, and ``nvflare`` CLI workflows for new automation.
- HA/Overseer code has been removed from the 2.8 branch.
- CellPipe cell names now keep the runtime token and pipe mode in one
explicitly marked FQCN leaf segment (``site-1.cellpipe-<job-id>_active``,
or ``<relay>.cellpipe-alias-<site>_<job-id>_active`` behind a relay) so a
pipe cell's FQCN parent matches the cell it actually connects to and pipe
names can never be confused with other cell names. As part of this change,
CellPipe validates tokens at construction: tokens must be non-empty, may
not start with ``alias-``, may not contain ``.`` when the pipe connects to
the site's own CP, and may not contain ``_`` or ``.`` when connected
through a relay. Custom ``FlareAgentWithCellPipe`` agent ids that
violate these rules now fail fast with a ``ValueError`` instead of
producing unroutable cell names.
- Both ends of a CellPipe pair derive each other's cell names independently,
so a Client Job (CJ) process and an external training process must run the same
NVFlare naming scheme. A training environment pinned to an older NVFlare
fails with "peer FQCN mismatch" when paired with a 2.8 CJ; align the
training environment's NVFlare version with the site's.

See the :ref:`migration_guide` for additional API and configuration migration
notes.
Expand Down
5 changes: 4 additions & 1 deletion nvflare/app_common/ccwf/client_ctl.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,10 @@ def _do_learn(self):
try:
self.do_learn_task(t.task_name, t.task_data, t.fl_ctx, t.abort_signal)
except:
self.logger.log(f"exception from do_learn_task: {secure_format_traceback()}")
self.log_exception(t.fl_ctx, "exception from do_learn_task")
# report the failure to the server so the job ends with an
# error status instead of FINISHED:COMPLETED
self.update_status(action="do_learn_task", error=ReturnCode.EXECUTION_EXCEPTION)
finally:
# force garbage collection
gc.collect()
Expand Down
13 changes: 8 additions & 5 deletions nvflare/fuel/f3/cellnet/core_cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ def _set_bb_for_client_child(self, parent_url: str, create_internal_listener: bo
self._create_bb_external_connector()
elif not parent_url and self.root_url:
# A cell configured with only a root URL (e.g. a CellPipe cell named
# <site>.<token>.<mode> that joins the cellnet at the root) has no
# <site>.cellpipe-<token>_<mode> that joins the cellnet at the root) has no
# other way to connect, regardless of its generation.
self._create_bb_external_connector()

Expand Down Expand Up @@ -1178,10 +1178,13 @@ def _try_find_ep(self, target_fqcn: str, for_msg: Message) -> Union[None, Endpoi
agent = self.agents.get(parent_fqcn)
if agent:
return agent.endpoint
# I'm not connected to my FQCN parent: cells with hierarchical
# names (e.g. CellPipe cells named <site>.<token>.<mode>) connect
# to an ancestor or to the root instead. Fall through to the
# generic resolution below.
# I'm not connected to my FQCN parent: some hierarchical cells
# connect to an ancestor or to the root instead (e.g. CellPipe
# cells named <site>.cellpipe-<token>_<mode> that connect to
# the server root). This fall-through is load-bearing for such cells - see
# test_pipe_cell_reaches_peer_through_server_root in
# core_cell_routing_test.py. Fall through to the generic
# resolution below.
self.logger.debug(f"{self.my_info.fqcn}: no connection to parent {parent_fqcn}")

# not the same family, or no direct path within the family
Expand Down
71 changes: 71 additions & 0 deletions nvflare/fuel/f3/cellnet/fqcn.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,84 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Tuple

from nvflare.fuel.common.fqn import FQN


class FQCN(FQN):
    """FQN specialization used for cellnet cell names; defines no members of its own."""


# CellPipe cells use explicitly marked leaf segments so pipe names can never
# be confused with other cell names or with each other:
# - plain leaf "cellpipe-<token>_<mode>" for pipes connected to the server
# root or to the site's own CP;
# - alias leaf "cellpipe-alias-<owner>_<token>_<mode>" for pipes connected
# through another cell (e.g. a relay). The alias maps the cell to the
# owning site for mTLS identity resolution and stream message
# authentication. Both directions of the alias grammar live here so they
# cannot drift apart.
#
# CellPipe cell-name schemes, in historical order:
# 1. flat (pre-2.7): the whole FQCN is "<site>_<token>_<mode>", a root-level
# sibling of the site cell.
# 2. hierarchical (#4801, never released): "<site>.<token>.<mode>". Replaced
# because the extra segments created unconnected FQCN parents that broke
# routing (NVBug 6371056).
# 3. topology (current): a single prefixed leaf segment under the FQCN of
# the cell the pipe actually connects to,
# "<parent>.cellpipe-<token>_<mode>", or
# "<relay_fqcn>.cellpipe-alias-<owner>_<token>_<mode>" when connected
# through another cell.
# Mixed-version notes: scheme-1 aliases are still accepted by identity
# resolution and stream auth (as whole-FQCN aliases via the bare grammar),
# but the two ends of one pipe pair must run the same scheme - each end
# derives the peer's name from its own code, so a CJ and a training
# subprocess on different schemes fail with "peer FQCN mismatch".
CELL_PIPE_LEAF_PREFIX = "cellpipe-"  # starts every marked CellPipe leaf segment
CELL_PIPE_ALIAS_PREFIX = "cellpipe-alias-"  # alias form; it also starts with the leaf prefix, so test it first
CELL_PIPE_ALIAS_MODES = ("active", "passive")  # the only accepted trailing <mode> values of an alias


def make_cell_pipe_alias(owner: str, runtime_id: str, mode: str) -> str:
    """Build the alias leaf segment ``cellpipe-alias-<owner>_<runtime_id>_<mode>``.

    The parts are joined as given; no validation is performed here, so the
    caller is responsible for passing a runtime_id the alias grammar can
    parse back (see parse_cell_pipe_alias).

    Args:
        owner: name of the site that owns the pipe cell.
        runtime_id: runtime token identifying the pipe pair.
        mode: pipe mode, placed last in the segment.

    Returns:
        The alias leaf segment.
    """
    # str.format applies the same format() conversion to each part as an
    # f-string would, so non-plain-str arguments render identically.
    return "{}{}_{}_{}".format(CELL_PIPE_ALIAS_PREFIX, owner, runtime_id, mode)


def parse_cell_pipe_alias(segment: str) -> Optional[Tuple[str, str, str]]:
    """Split a CellPipe alias leaf segment into its (owner, runtime_id, mode) parts.

    Accepted shapes:
      - explicit: "cellpipe-alias-<owner>_<runtime_id>_<mode>" (current form);
      - bare legacy: "<owner>_<runtime_id>_<mode>", used by pre-2.8 flat
        CellPipe names where the whole FQCN is the alias. This function does
        not decide where the bare form is acceptable - callers do, and they
        normally restrict it to single-segment FQCNs so that an unmarked
        "<token>_<mode>" leaf inside a longer FQCN is never misread as an
        alias.

    For both shapes the runtime_id must be non-empty and free of "." and "_".
    Fields are taken from the right, which makes the reading unambiguous:
    "site-a_x_<uuid>_active" belongs to owner "site-a_x", never to "site-a"
    with a runtime id of "x_<uuid>".

    Args:
        segment: the leaf segment (or whole single-segment FQCN) to parse.

    Returns:
        (owner, runtime_id, mode) when the segment is a valid alias, else None.
    """
    # Drop the explicit marker when present; the bare legacy form has none.
    marked = segment.startswith(CELL_PIPE_ALIAS_PREFIX)
    body = segment[len(CELL_PIPE_ALIAS_PREFIX) :] if marked else segment

    # Rightmost field: the pipe mode.
    rest, mode_sep, mode = body.rpartition("_")
    if mode_sep != "_" or mode not in CELL_PIPE_ALIAS_MODES:
        return None

    # Next field from the right: the runtime id. Splitting on the last "_"
    # already guarantees it holds no "_", so only "." is checked explicitly.
    owner, id_sep, runtime_id = rest.rpartition("_")
    if id_sep and owner and runtime_id and "." not in runtime_id:
        return owner, runtime_id, mode
    return None


class FqcnInfo:
def __init__(self, fqcn: str):
self.fqcn = fqcn
Expand Down
60 changes: 31 additions & 29 deletions nvflare/fuel/f3/cellnet/identity.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from cryptography.x509.oid import NameOID

from nvflare.apis.fl_constant import ConnectionSecurity
from nvflare.fuel.f3.cellnet.fqcn import FQCN
from nvflare.fuel.f3.cellnet.fqcn import CELL_PIPE_ALIAS_PREFIX, CELL_PIPE_LEAF_PREFIX, FQCN, parse_cell_pipe_alias
from nvflare.fuel.f3.drivers.driver_params import DriverParams
from nvflare.fuel.f3.drivers.net_utils import SECURE_SCHEMES
from nvflare.fuel.utils.admin_name_utils import is_valid_admin_client_name
Expand Down Expand Up @@ -126,27 +126,13 @@ def _resolve_local_child_identity(self, fqcn: str) -> Optional[str]:

@staticmethod
def _get_cell_pipe_alias_owner(segment: str) -> Optional[str]:
    """Return the owning site named by a CellPipe alias segment, or None.

    CellPipe cells connected through another cell use an alias leaf like
    "cellpipe-alias-site-1_<runtime-id>_active" but authenticate with the
    owning site's certificate. Older NVFlare versions used the bare sibling
    form "site-1_<runtime-id>_active" as the whole FQCN.

    The alias grammar is implemented only in parse_cell_pipe_alias; this
    method delegates to it instead of keeping a second inline parser that
    could drift from it.

    Args:
        segment: the FQCN leaf segment to inspect.

    Returns:
        The owner part of the alias, or None when the segment is not a valid
        alias.
    """
    parsed = parse_cell_pipe_alias(segment)
    return parsed[0] if parsed else None

def resolve(self, fqcn: str) -> Optional[str]:
if not fqcn:
Expand All @@ -167,13 +153,29 @@ def resolve(self, fqcn: str) -> Optional[str]:
if identity:
return identity

# This legacy-alias check intentionally precedes _resolve_local_child_identity:
# an old-format CellPipe alias cell may connect as a direct child of this
# local cell, but it authenticates with the owning site's certificate, not
# with a certificate named after the alias segment itself.
alias_owner = self._get_cell_pipe_alias_owner(parts[-1]) if parts else None
if alias_owner:
return self.resolve(alias_owner)
# The alias interpretation applies to the two shapes that carry an
# alias: the explicitly marked leaf "cellpipe-alias-..." at any depth,
# or a legacy (pre-2.8 flat naming) cell whose whole FQCN is the bare
# alias. An unmarked "<token>_<mode>"-shaped leaf inside a longer FQCN
# is never an alias - its token may itself contain "_" (e.g.
# "site-1.cellpipe-ext_trainer_active"), and alias-parsing it would
# fabricate a wrong owner such as "ext". This check intentionally
# precedes _resolve_local_child_identity: an alias cell connected as a
# direct child of this local cell authenticates with the owning site's
# certificate, not with a certificate named after the alias segment.
leaf = parts[-1]
if leaf.startswith(CELL_PIPE_ALIAS_PREFIX) or len(parts) == 1:
alias_owner = self._get_cell_pipe_alias_owner(leaf)
if alias_owner:
return self.resolve(alias_owner)

# A plain CellPipe leaf ("cellpipe-<token>_<mode>") authenticates with
# the identity of the cell it is named under, so resolve its parent.
# This must also precede _resolve_local_child_identity: a CP resolving
# its own pipe child expects the site's identity, not one named after
# the leaf segment.
if len(parts) > 1 and leaf.startswith(CELL_PIPE_LEAF_PREFIX):
return self.resolve(FQCN.join(parts[:-1]))

identity = self._resolve_local_child_identity(fqcn)
if identity:
Expand Down
76 changes: 67 additions & 9 deletions nvflare/fuel/utils/pipe/cell_pipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import queue
import threading
import time
Expand All @@ -22,7 +23,7 @@
from nvflare.fuel.f3.cellnet.cell import Cell
from nvflare.fuel.f3.cellnet.cell import Message as CellMessage
from nvflare.fuel.f3.cellnet.defs import MessageHeaderKey, ReturnCode
from nvflare.fuel.f3.cellnet.fqcn import FQCN
from nvflare.fuel.f3.cellnet.fqcn import CELL_PIPE_LEAF_PREFIX, FQCN, make_cell_pipe_alias
from nvflare.fuel.f3.cellnet.net_agent import NetAgent
from nvflare.fuel.f3.cellnet.utils import make_reply
from nvflare.fuel.f3.drivers.driver_params import DriverParams
Expand All @@ -44,24 +45,81 @@
_HEADER_START_TIME = _PREFIX + "start"
_HEADER_HB_SEQ = _PREFIX + "hb_seq"

_logger = logging.getLogger(__name__)


def _cell_fqcn(mode, site_name, token, parent_fqcn):
    """Compute the FQCN of a CellPipe cell.

    The FQCN must be unique in the whole cellnet. The runtime token and the
    pipe mode are kept in ONE leaf segment so that no unconnected FQCN parent
    such as <site>.<token> is introduced. The two peer pipes on the same site
    share the same site_name and token and differ only by mode.

    Naming by connection target:
      - server root: "<site_name>.cellpipe-<token>_<mode>";
      - the site's own CP (last segment of parent_fqcn equals site_name):
        "<parent_fqcn>.cellpipe-<token>_<mode>";
      - any other cell (e.g. a relay):
        "<parent_fqcn>.cellpipe-alias-<site_name>_<token>_<mode>";
      - no parent FQCN: named like the server-root case, with a warning.

    Args:
        mode: pipe mode; interpolated as the last part of the leaf segment.
        site_name: name of the site that owns the pipe.
        token: runtime token shared by both ends of the pipe pair.
        parent_fqcn: FQCN of the cell the pipe connects to; may be empty.

    Returns:
        The FQCN string for the pipe cell.

    Raises:
        ValueError: if token is empty, starts with "alias-", contains "."
            when connecting to the site's own CP, or contains "_" or "." when
            connecting through another cell.
    """
    if not token:
        # The configured token (e.g. "{JOB_ID}") resolved to an empty string.
        # An empty token cannot uniquely name the cell (all such pipes on the
        # site would collide), so fail fast. A generated fallback is not an
        # option: the two ends of a pipe pair derive each other's names
        # independently, so any per-process unique value would break the
        # pair's rendezvous.
        raise ValueError("invalid CellPipe token: token must be a non-empty string")
    if token.startswith("alias-"):
        # a plain leaf "cellpipe-alias-..._<mode>" would collide with the
        # relay alias namespace and could parse to a fabricated owner
        raise ValueError(f"invalid CellPipe token '{token}': the 'alias-' prefix is reserved for alias cell names")

    cell_name = f"{CELL_PIPE_LEAF_PREFIX}{token}_{mode}"
    if parent_fqcn == FQCN.ROOT_SERVER:
        # Connected to the server root (e.g. simulator, or pipes configured
        # with a root url): the cell is named <site>.<leaf> while physically
        # connected to the root. Its FQCN parent <site> is NOT connected, and
        # routing relies on the fall-through in CoreCell._try_find_ep() that
        # resolves the next leg through the root when the FQCN parent is
        # absent. That fall-through is load-bearing for these cells (covered
        # by test_pipe_cell_reaches_peer_through_server_root in
        # core_cell_routing_test.py).
        # A "." in the token adds phantom FQCN segments, but root-connected
        # cells route through the root fall-through regardless of depth, so
        # dotted user-chosen tokens (e.g. agent ids) keep working as they did
        # before the topology naming.
        prefix = site_name
    elif parent_fqcn and FQCN.split(parent_fqcn)[-1] == site_name:
        # Connecting to the site's own CP: the site is already the last
        # segment, and the FQCN parent is the actually connected cell, so
        # normal parent routing applies.
        if "." in token:
            # a "." would put an unconnected FQCN parent between the CP and
            # the cell, making the cell unreachable
            raise ValueError(
                f"invalid CellPipe token '{token}': '.' would split the cell name into extra FQCN segments"
            )
        prefix = parent_fqcn
    elif parent_fqcn:
        # Connecting to another cell (e.g. a relay): use the alias leaf
        # cellpipe-alias-<site>_<token>_<mode> so mTLS identity resolution and
        # stream auth map the cell to the owning site (see
        # parse_cell_pipe_alias). The token is the alias runtime id and must
        # not contain "_" or ".", or the alias would parse to the wrong owner
        # (or not parse at all) and the connection/messages would be rejected.
        # Tokens are normally job ids (UUIDs), so this only affects custom
        # tokens such as FlareAgentWithCellPipe agent ids.
        if "_" in token or "." in token:
            raise ValueError(
                f"invalid CellPipe token '{token}': must not contain '_' or '.' "
                f"when connected through another cell ({parent_fqcn})"
            )
        prefix = parent_fqcn
        cell_name = make_cell_pipe_alias(site_name, token, mode)
    else:
        # Missing parent FQCN: keep the cell under the owning site so routing
        # still has a topology-shaped parent instead of creating <site>.<token>.
        # No in-tree caller hits this today; warn so a misconfiguration that
        # produces a correct-looking name is still diagnosable.
        _logger.warning(
            f"CellPipe conn props have no parent FQCN; naming cell under site '{site_name}' "
            "and relying on root routing"
        )
        prefix = site_name
    return FQCN.join([prefix, cell_name])


def _to_cell_message(msg: Message, extra=None) -> CellMessage:
Expand Down
Loading
Loading