Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
227 changes: 227 additions & 0 deletions .ci/cvmfs_catalog_analysis
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
#!/usr/bin/env python3
"""Analyze .cvmfscatalog placement under a container sandbox prefix.

For each catalog boundary, reports how many filesystem entries (files +
directories) it "owns" — i.e., entries in its subtree that are NOT delegated
to a deeper nested catalog. Also flags large directories that have no catalog
boundary and might benefit from one.

Usage:
.ci/cvmfs_catalog_analysis [OPTIONS] PREFIX

PREFIX Root of the sandbox to analyze, e.g.:
/cvmfs/singularity.opensciencegrid.org/eicweb/eic_xl:nightly

Options:
--min-entries N Suggest adding a catalog when a non-catalog directory's
owned-entry count exceeds N (default: 500)
--max-entries N Warn when a catalog's owned-entry count exceeds N
(default: 200000)
--depth N Maximum directory depth to walk below PREFIX (default: 8)
--help Show this help and exit

Output columns:
STATUS CATALOG - directory has a .cvmfscatalog marker
SUGGEST(n) - directory has no catalog but > --min-entries owned
WARN(n) - catalog has > --max-entries owned entries
OWNED Entries owned by this catalog (not delegated to children)
TOTAL Total entries in the subtree (including delegated children)
PATH Path relative to PREFIX
"""

import argparse
import os
import sys
from collections import defaultdict


def parse_args(argv=None):
    """Parse the command-line options of the catalog analysis tool.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) keeps the original
            behaviour of reading the process arguments.

    Returns:
        An ``argparse.Namespace`` with ``prefix``, ``min_entries``,
        ``max_entries`` and ``depth`` attributes.
    """
    parser = argparse.ArgumentParser(
        # The module docstring already contains the full usage text, so it is
        # shown verbatim instead of argparse re-wrapping it.
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # The help option is registered explicitly below.
        add_help=False,
    )
    parser.add_argument("prefix", metavar="PREFIX")
    parser.add_argument("--min-entries", type=int, default=500)
    parser.add_argument("--max-entries", type=int, default=200000)
    parser.add_argument("--depth", type=int, default=8)
    parser.add_argument("-h", "--help", action="help")
    return parser.parse_args(argv)


def walk_tree(prefix, max_depth):
    """Scan the tree under *prefix* once and count entries per directory.

    Directories more than *max_depth* levels below *prefix* are not scanned;
    they still count as one entry of their parent. Symlinks are never
    followed (so the scan cannot leave the sandbox), and directories that
    cannot be listed are skipped silently.

    Args:
        prefix: Root directory to analyze; resolved with ``os.path.realpath``.
        max_depth: Maximum directory depth to scan below *prefix*
            (0 scans only *prefix* itself).

    Returns:
        A ``(results, catalog_roots)`` pair.

        ``results`` is a list with one tuple
        ``(abs_path, rel_path, depth, has_catalog, owned, total)`` per scanned
        directory, in top-down ``os.walk`` order, where

        * ``has_catalog`` is True if the directory contains a
          ``.cvmfscatalog`` file,
        * ``owned`` is, for catalog directories, the number of entries in the
          subtree that are not inside a nested catalog (the nested catalog's
          own directory entry is counted, marker files are not); it is
          ``None`` for directories without a catalog,
        * ``total`` is the number of entries in the whole scanned subtree.

        ``catalog_roots`` is the set of absolute paths of all catalog
        directories.
    """
    marker = ".cvmfscatalog"
    prefix = os.path.realpath(prefix)

    visited = []           # (abs_path, rel_path, depth, has_catalog), top-down
    total = {}             # abs_path -> entries in the scanned subtree
    region = {}            # abs_path -> entries up to the next catalog boundary
    catalog_roots = set()

    # Single top-down pass. os.walk ignores unreadable directories by default.
    for root, dirs, files in os.walk(prefix, followlinks=False):
        rel = os.path.relpath(root, prefix)
        depth = 0 if rel == "." else rel.count(os.sep) + 1
        if depth > max_depth:
            # Only reachable for a negative max_depth: nothing is scanned.
            dirs.clear()
            continue

        has_catalog = marker in files
        if has_catalog:
            catalog_roots.add(root)

        direct = len(dirs) + len(files)
        total[root] = direct
        # Marker entries never count towards a catalog's owned entries.
        region[root] = direct - dirs.count(marker) - files.count(marker)
        visited.append((root, rel, depth, has_catalog))

        if depth == max_depth:
            # Children would be beyond the depth limit: do not even list them.
            dirs.clear()

    # Bottom-up aggregation. os.walk yields every directory before its
    # descendants, so in reverse order each directory's counts are final by
    # the time they are folded into its parent.
    for root, _rel, depth, has_catalog in reversed(visited):
        if depth == 0:
            continue  # the prefix itself has no parent inside the sandbox
        parent = os.path.dirname(root)
        total[parent] += total[root]
        # Contents of a nested catalog are delegated to that catalog and do
        # not count as owned by any ancestor.
        if not has_catalog and os.path.basename(root) != marker:
            region[parent] += region[root]

    results = [
        (root, rel, depth, has_catalog,
         region[root] if has_catalog else None, total[root])
        for root, rel, depth, has_catalog in visited
    ]
    return results, catalog_roots


def main():
    """Analyze the PREFIX given on the command line and print the report.

    Prints one table row per catalog directory and one per non-catalog
    directory whose owned-entry count exceeds ``--min-entries``, followed by
    a summary of warnings and suggestions. Progress and errors go to stderr.
    Exits with status 1 if PREFIX is not a directory.
    """
    args = parse_args()
    prefix = os.path.realpath(args.prefix)

    if not os.path.isdir(prefix):
        print(f"error: prefix not found: {prefix}", file=sys.stderr)
        sys.exit(1)

    print(f"Analyzing {prefix}", file=sys.stderr)
    results, catalog_roots = walk_tree(prefix, args.depth)

    # delegated[d] = number of entries below d that sit inside a nested
    # catalog. Folding the per-directory totals from the deepest level upwards
    # gives this without touching the filesystem again: a catalog child hands
    # over its whole subtree, any other child passes on what it delegates.
    delegated = defaultdict(int)
    for root, _rel, depth, has_catalog, _owned, total in sorted(
        results, key=lambda r: r[2], reverse=True
    ):
        if depth == 0:
            continue  # the prefix has no parent inside the sandbox
        parent = os.path.dirname(root)
        delegated[parent] += total if has_catalog else delegated[root]

    print(f"{'STATUS':<20} {'OWNED':>8} {'TOTAL':>8} PATH")
    print("-" * 80)

    warned = []
    suggested = []

    for root, rel, _depth, has_catalog, owned, total in sorted(results, key=lambda r: r[1]):
        display_path = "/" + rel if rel != "." else "/"

        if has_catalog:
            if owned > args.max_entries:
                status = f"WARN({owned})"
                warned.append(display_path)
            else:
                status = "CATALOG"
            print(f"{status:<20} {owned:>8} {total:>8} {display_path}")
        else:
            # Entries this directory would own if it were given a catalog:
            # everything in its subtree except what nested catalogs hold.
            # Non-catalog directories contain no marker file, so nothing else
            # has to be excluded. Only directories above the threshold are
            # reported.
            nc_owned = total - delegated[root]
            if nc_owned > args.min_entries:
                status = f"SUGGEST({nc_owned})"
                suggested.append((display_path, nc_owned))
                print(f"{status:<20} {nc_owned:>8} {total:>8} {display_path}")

    print()
    print(f"Catalog roots found: {len(catalog_roots)}")
    if warned:
        print(f"\nWARNING: {len(warned)} catalog(s) exceed {args.max_entries} entries:")
        for path in warned:
            print(f" {path}")
    if suggested:
        print(
            f"\nSUGGESTION: {len(suggested)} director(ies) may benefit "
            f"from a catalog (> {args.min_entries} owned entries):"
        )
        # Largest candidates first.
        for path, count in sorted(suggested, key=lambda item: -item[1]):
            print(f" {count:>8} {path}")


# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
12 changes: 12 additions & 0 deletions containers/debian/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,18 @@ git -C ${EICSPACK_ROOT} checkout ${EICSPACK_SHA:-${EICSPACK_VERSION}}
spack repo add --scope spack "${EICSPACK_ROOT}/spack_repo/eic"
EOF

## Place cvmfs catalogs for system directories
## The /etc and /usr subtrees are static (installed once at base-image build
## time) and large enough to warrant their own catalog boundaries, so that
## CVMFS clients do not have to load the entire root catalog to traverse them.
RUN <<EOF
set -e
touch /etc/.cvmfscatalog
touch /usr/.cvmfscatalog
# One catalog for every directory in the first two levels below /usr
find /usr -mindepth 1 -maxdepth 2 -type d -exec touch {}/.cvmfscatalog \;
EOF

## Ensure NVIDIA driver exposes all capabilities
## https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
ARG NVIDIA_VISIBLE_DEVICES
Expand Down
22 changes: 21 additions & 1 deletion containers/eic/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -334,10 +334,30 @@ ENV CMAKE_TOOLCHAIN_FILE=/opt/local/etc/cmake/find_package_resolve_symlinks.cmak
## Place cvmfs catalogs
RUN <<EOF
set -e
# Spack installation trees
touch ${SPACK_ROOT}/.cvmfscatalog
touch /opt/spack-packages/.cvmfscatalog
# Install tree: root catalog, one catalog per arch dir, one per package.
# Stopping at depth 2 (package level) avoids creating wasteful per-subdir
# catalogs (lib/, bin/, share/, etc.) that change atomically with the package.
touch /opt/software/.cvmfscatalog
find /opt/software -mindepth 1 -maxdepth 1 -type d -exec touch {}/.cvmfscatalog \;
find /opt/software -mindepth 2 -maxdepth 2 -type d -exec touch {}/.cvmfscatalog \;
# Spack view: catalog per top-level subdir so that updating a package only
# invalidates the view subdirs it contributes to (lib/, bin/, etc.),
# not the entire merged view.
touch /opt/local/.cvmfscatalog
find /opt/local -mindepth 1 -maxdepth 1 -type d -exec touch {}/.cvmfscatalog \;
# Detector: each version is an independent Spack install sub-tree
touch /opt/detector/.cvmfscatalog
find /opt/detector -mindepth 1 -maxdepth 1 -type d -exec touch {}/.cvmfscatalog \;
# Benchmark and campaign git repos each change independently
touch /opt/benchmarks/.cvmfscatalog
find /opt/benchmarks -mindepth 1 -maxdepth 1 -type d -exec touch {}/.cvmfscatalog \;
touch /opt/campaigns/.cvmfscatalog
find /opt/campaigns -mindepth 1 -maxdepth 1 -type d -exec touch {}/.cvmfscatalog \;
# Spack environment configs
touch /opt/spack-environment/.cvmfscatalog
EOF

## Store environment
Expand Down
Loading