From a361e277cdab1204c5dfa79da56b1a69287d12df Mon Sep 17 00:00:00 2001 From: zhangzhibiao Date: Fri, 22 May 2026 10:34:00 +0800 Subject: [PATCH 1/7] Place build artifacts under output --- CMakeLists.txt | 40 ++++++++++++++++++++++-- Makefile | 51 ++++++++++++++++++++++--------- benchmarks/mstress/CMakeLists.txt | 2 ++ examples/cc/CMakeLists.txt | 3 ++ src/cc/devtools/CMakeLists.txt | 2 ++ src/cc/emulator/CMakeLists.txt | 2 ++ src/cc/qcrs/CMakeLists.txt | 2 ++ src/cc/qfsc/CMakeLists.txt | 2 ++ src/cc/tools/CMakeLists.txt | 2 ++ 9 files changed, 90 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 297d17bd7..330ece184 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,10 +135,46 @@ if(NOT OPENSSL_VERSION OR OPENSSL_VERSION MATCHES "^0[.]") message(STATUS "WARNING: QFS authentication will not work properly") endif() +if(NOT DEFINED QFS_OUTPUT_DIR) + set(QFS_OUTPUT_DIR "${KFS_DIR_PREFIX}/output" CACHE PATH + "directory for deployable QFS build output") +endif() +get_filename_component(QFS_OUTPUT_DIR "${QFS_OUTPUT_DIR}" ABSOLUTE + BASE_DIR "${KFS_DIR_PREFIX}") + # Change this to where the install directory is located -if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) - set(CMAKE_INSTALL_PREFIX "." 
CACHE PATH "install directory prefix" FORCE) +if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT OR + CMAKE_INSTALL_PREFIX STREQUAL ".") + set(CMAKE_INSTALL_PREFIX "${QFS_OUTPUT_DIR}" CACHE PATH + "install directory prefix" FORCE) endif() +message(STATUS "QFS deployable output directory: ${CMAKE_INSTALL_PREFIX}") +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${QFS_OUTPUT_DIR}/bin" CACHE PATH + "directory for QFS runtime build output" FORCE) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${QFS_OUTPUT_DIR}/lib" CACHE PATH + "directory for QFS shared library build output" FORCE) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${QFS_OUTPUT_DIR}/lib/static" CACHE PATH + "directory for QFS static library build output" FORCE) +foreach(QFS_OUTPUT_CONFIG DEBUG RELEASE RELWITHDEBINFO MINSIZEREL) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${QFS_OUTPUT_CONFIG} + "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}" CACHE PATH + "directory for QFS runtime build output" FORCE) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_${QFS_OUTPUT_CONFIG} + "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" CACHE PATH + "directory for QFS shared library build output" FORCE) + set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_${QFS_OUTPUT_CONFIG} + "${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}" CACHE PATH + "directory for QFS static library build output" FORCE) +endforeach() + +function(qfs_set_target_runtime_output_dir output_dir) + set_target_properties(${ARGN} PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${output_dir}") + foreach(QFS_OUTPUT_CONFIG DEBUG RELEASE RELWITHDEBINFO MINSIZEREL) + set_target_properties(${ARGN} PROPERTIES + RUNTIME_OUTPUT_DIRECTORY_${QFS_OUTPUT_CONFIG} "${output_dir}") + endforeach() +endfunction() # Build with statically linked libraries; the value for this variable has to be # defined here overwriting whatever is in the cache. diff --git a/Makefile b/Makefile index 9a1da4d61..1c10e3227 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,7 @@ # Do not assume GNU Make. Keep this makefile as simple as possible. 
BUILD_TYPE=release +QFS_OUTPUT_DIR=output CMAKE_OPTIONS=-D CMAKE_BUILD_TYPE=RelWithDebInfo CMAKE=cmake MAKE_OPTIONS= @@ -31,7 +32,7 @@ QFSHADOOP_VERSIONS=0.23.11 1.0.4 1.1.2 2.5.1 2.7.2 2.7.7 2.8.5 2.9.2 2.1 QFS_PYTHON_DIR=python-qfs QFS_PYTHON_WHEEL_DIR=${QFS_PYTHON_DIR}/dist -QFS_PYTHON_TEST_OPTION=test -d ${QFS_PYTHON_WHEEL_DIR} && echo -python-wheel-dir ${QFS_PYTHON_WHEEL_DIR} +QFS_PYTHON_TEST_OPTION=test -d $${qfs_output_dir}/${QFS_PYTHON_WHEEL_DIR} && echo -python-wheel-dir $${qfs_output_dir}/${QFS_PYTHON_WHEEL_DIR} QFS_MSTRESS_ON=true .PHONY: all @@ -39,11 +40,16 @@ all: build .PHONY: dir dir: - mkdir -p build/${BUILD_TYPE} + mkdir -p build/${BUILD_TYPE} ${QFS_OUTPUT_DIR} .PHONY: run-cmake run-cmake: dir - cd build/${BUILD_TYPE} && ${CMAKE} ${CMAKE_OPTIONS} ../.. + cd build/${BUILD_TYPE} && \ + qfs_output_dir=`cd ../.. && pwd`/${QFS_OUTPUT_DIR} && \ + ${CMAKE} \ + -D QFS_OUTPUT_DIR="$$qfs_output_dir" \ + -D CMAKE_INSTALL_PREFIX="$$qfs_output_dir" \ + ${CMAKE_OPTIONS} ../.. .PHONY: build build: run-cmake @@ -51,11 +57,19 @@ build: run-cmake `${QFS_MSTRESS_ON} && \ echo ${QFSHADOOP_VERSIONS} | grep '3\.4\.1' >/dev/null 2>&1 && \ mvn --version >/dev/null 2>&1 && echo mstress-bootstrap mstress-tarball` + if ls -1 build/${BUILD_TYPE}/benchmarks/mstress*.tgz >/dev/null 2>&1; then \ + mkdir -p ${QFS_OUTPUT_DIR}/benchmarks && \ + cp build/${BUILD_TYPE}/benchmarks/mstress*.tgz \ + ${QFS_OUTPUT_DIR}/benchmarks/; \ + fi .PHONY: java java: build ./src/java/javabuild.sh ${JAVA_BUILD_OPTIONS} clean ./src/java/javabuild.sh ${JAVA_BUILD_OPTIONS} + if ls -1 build/java/qfs-access/qfs-access*.jar >/dev/null 2>&1; then \ + cp build/java/qfs-access/qfs-access*.jar ${QFS_OUTPUT_DIR}/lib/; \ + fi .PHONY: hadoop-jars hadoop-jars: java @@ -67,6 +81,9 @@ hadoop-jars: java || exit 1; \ done \ ; fi + if ls -1 build/java/hadoop-qfs/hadoop-*.jar >/dev/null 2>&1; then \ + cp build/java/hadoop-qfs/hadoop-*.jar ${QFS_OUTPUT_DIR}/lib/; \ + fi .PHONY: go go: build @@ -76,7 +93,7 @@ go: build 
exit; \ } \ END { exit ret ? 0 : 1 }'; then \ - QFS_BUILD_DIR=`pwd`/build/$(BUILD_TYPE) && \ + QFS_BUILD_DIR=`pwd`/${QFS_OUTPUT_DIR} && \ cd src/go && \ CGO_CFLAGS="-I$${QFS_BUILD_DIR}/include" && \ export CGO_CFLAGS && \ @@ -93,6 +110,7 @@ go: build .PHONY: tarball tarball: hadoop-jars python cd build && \ + qfs_output_dir=../${QFS_OUTPUT_DIR}; \ myuname=`uname -s`; \ myarch=`cc -dumpmachine 2>/dev/null | cut -d - -f 1` ; \ [ x"$$myarch" = x ] && \ @@ -129,16 +147,16 @@ tarball: hadoop-jars python { test -d tmpreldir || mkdir tmpreldir; } && \ rm -rf "tmpreldir/$$tarname" && \ mkdir "tmpreldir/$$tarname" && \ - cp -r ${BUILD_TYPE}/bin ${BUILD_TYPE}/lib \ - ${BUILD_TYPE}/include ../scripts ../webui \ + cp -r $$qfs_output_dir/bin $$qfs_output_dir/lib \ + $$qfs_output_dir/include ../scripts ../webui \ ../examples ../benchmarks "tmpreldir/$$tarname/" && \ if ls -1 ./java/qfs-access/qfs-access-*.jar >/dev/null 2>&1; then \ cp ./java/qfs-access/qfs-access*.jar "tmpreldir/$$tarname/lib/"; fi && \ if ls -1 ./java/hadoop-qfs/hadoop-*.jar >/dev/null 2>&1; then \ cp ./java/hadoop-qfs/hadoop-*.jar "tmpreldir/$$tarname/lib/"; fi && \ - if ls -1 ${BUILD_TYPE}/${QFS_PYTHON_WHEEL_DIR}/qfs*.whl >/dev/null 2>&1; \ + if ls -1 $$qfs_output_dir/${QFS_PYTHON_WHEEL_DIR}/qfs*.whl >/dev/null 2>&1; \ then \ - cp ${BUILD_TYPE}/${QFS_PYTHON_WHEEL_DIR}/qfs*.whl \ + cp $$qfs_output_dir/${QFS_PYTHON_WHEEL_DIR}/qfs*.whl \ "tmpreldir/$$tarname/lib/"; fi && \ if ls -1 ${BUILD_TYPE}/benchmarks/mstress.tgz > /dev/null 2>&1; then \ cp ${BUILD_TYPE}/benchmarks/mstress.tgz \ @@ -151,7 +169,7 @@ python: build if python3 -c 'import sys; exit(0 if sys.version_info >= (3, 6) else 1)' \ >/dev/null 2>&1 && \ python3 -c 'import venv' >/dev/null 2>&1 ; then \ - cd build/${BUILD_TYPE} && \ + cd ${QFS_OUTPUT_DIR} && \ rm -rf ${QFS_PYTHON_DIR} && \ mkdir ${QFS_PYTHON_DIR} && \ cd ${QFS_PYTHON_DIR} && \ @@ -159,7 +177,10 @@ python: build ln -s ../../../src/cc/access/kfs_setup.py setup.py && \ python3 -m venv .venv 
&& \ . .venv/bin/activate && python -m pip install build && \ - python -m build -w . ; \ + python -m build -w . && \ + if ls -1 dist/qfs*.whl >/dev/null 2>&1; then \ + cp dist/qfs*.whl ../lib/; \ + fi ; \ else \ echo 'python3 module venv is not available'; \ fi @@ -167,14 +188,16 @@ python: build .PHONY: mintest mintest: hadoop-jars python cd build/${BUILD_TYPE} && \ + qfs_output_dir=`cd ../.. && pwd`/${QFS_OUTPUT_DIR} && \ ../../src/test-scripts/qfstest.sh \ `${QFS_PYTHON_TEST_OPTION}` \ - -install-prefix . -auth ${QFSTEST_OPTIONS} + -install-prefix "$$qfs_output_dir" -auth ${QFSTEST_OPTIONS} .PHONY: test test: mintest cd build/${BUILD_TYPE} && \ - installbindir=`pwd`/bin && \ + qfs_output_dir=`cd ../.. && pwd`/${QFS_OUTPUT_DIR} && \ + installbindir=$$qfs_output_dir/bin && \ metadir=$$installbindir && \ export metadir && \ chunkdir=$$installbindir && \ @@ -192,7 +215,7 @@ test: mintest echo '--------- Test without authentication --------' && \ ../../src/test-scripts/qfstest.sh \ `${QFS_PYTHON_TEST_OPTION}` \ - -install-prefix . 
-noauth ${QFSTEST_OPTIONS} ; \ + -install-prefix "$$qfs_output_dir" -noauth ${QFSTEST_OPTIONS} ; \ fi .PHONY: rat @@ -201,4 +224,4 @@ rat: dir .PHONY: clean clean: - rm -rf build + rm -rf build ${QFS_OUTPUT_DIR} diff --git a/benchmarks/mstress/CMakeLists.txt b/benchmarks/mstress/CMakeLists.txt index 2f168d4a1..e6be75856 100644 --- a/benchmarks/mstress/CMakeLists.txt +++ b/benchmarks/mstress/CMakeLists.txt @@ -20,6 +20,8 @@ # add_executable(mstress_client EXCLUDE_FROM_ALL mstress_client.cc) +qfs_set_target_runtime_output_dir("${CMAKE_CURRENT_BINARY_DIR}" + mstress_client) if(USE_STATIC_LIB_LINKAGE) add_dependencies(mstress_client kfsClient) diff --git a/examples/cc/CMakeLists.txt b/examples/cc/CMakeLists.txt index d87e0ac0c..e12aa000d 100644 --- a/examples/cc/CMakeLists.txt +++ b/examples/cc/CMakeLists.txt @@ -32,5 +32,8 @@ else (USE_STATIC_LIB_LINKAGE) add_dependencies (qfssample kfsClient-shared) endif (USE_STATIC_LIB_LINKAGE) +qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/examples" + qfssample) + install (TARGETS qfssample RUNTIME DESTINATION bin/examples) diff --git a/src/cc/devtools/CMakeLists.txt b/src/cc/devtools/CMakeLists.txt index f527b2086..9b2f304be 100644 --- a/src/cc/devtools/CMakeLists.txt +++ b/src/cc/devtools/CMakeLists.txt @@ -55,6 +55,8 @@ foreach (exe_file ${exe_files}) ) endif (USE_STATIC_LIB_LINKAGE) endforeach (exe_file) +qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/devtools" + ${exe_files}) # install (TARGETS ${exe_files} diff --git a/src/cc/emulator/CMakeLists.txt b/src/cc/emulator/CMakeLists.txt index 1c0b8caa6..bd00d91d3 100644 --- a/src/cc/emulator/CMakeLists.txt +++ b/src/cc/emulator/CMakeLists.txt @@ -52,6 +52,8 @@ foreach (exe_file ${exe_files}) kfsEmulator ) endforeach (exe_file) +qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/emulator" + ${exe_files}) if (CMAKE_SYSTEM_NAME STREQUAL "SunOS") target_link_libraries(kfsEmulator mtmalloc) diff --git a/src/cc/qcrs/CMakeLists.txt b/src/cc/qcrs/CMakeLists.txt 
index c4a6efb7a..28c31a05e 100644 --- a/src/cc/qcrs/CMakeLists.txt +++ b/src/cc/qcrs/CMakeLists.txt @@ -166,6 +166,8 @@ add_executable (${rsmktablebin} mktable_main.c) target_link_libraries (${rstestbin} kfsrs) add_dependencies (${rstestbin} kfsrs) add_dependencies (${rsmktablebin} kfsrs) +qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/devtools" + ${rstestbin} ${rsmktablebin}) install (TARGETS kfsrs kfsrs-shared LIBRARY DESTINATION lib diff --git a/src/cc/qfsc/CMakeLists.txt b/src/cc/qfsc/CMakeLists.txt index f66d91acb..3b54fffdb 100644 --- a/src/cc/qfsc/CMakeLists.txt +++ b/src/cc/qfsc/CMakeLists.txt @@ -44,6 +44,8 @@ target_link_libraries (qfsc-shared add_executable (test-qfsc test-qfsc.c) set_target_properties (test-qfsc PROPERTIES LINKER_LANGUAGE CXX) +qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/devtools" + test-qfsc) if (USE_STATIC_LIB_LINKAGE) add_dependencies (test-qfsc qfsc) diff --git a/src/cc/tools/CMakeLists.txt b/src/cc/tools/CMakeLists.txt index 74ca9c395..5f71fa688 100644 --- a/src/cc/tools/CMakeLists.txt +++ b/src/cc/tools/CMakeLists.txt @@ -102,6 +102,8 @@ foreach (exe_file ${exe_files}) target_link_libraries (${exe_file} tools-shared) endif (USE_STATIC_LIB_LINKAGE) endforeach (exe_file) +qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/tools" + ${exe_files}) # install (TARGETS ${exe_files} tools From b2fcfb5262794690a6e06538509b2728bced1715 Mon Sep 17 00:00:00 2001 From: zhangzhibiao Date: Mon, 25 May 2026 13:43:36 +0800 Subject: [PATCH 2/7] Improve mstress local benchmark behavior --- benchmarks/mstress/mstress.py | 70 +++++++++++++++++++--------- benchmarks/mstress/mstress_client.cc | 50 ++++++++++---------- benchmarks/mstress/mstress_plan.py | 7 ++- benchmarks/mstress/mstress_run.py | 29 ++++++------ 4 files changed, 91 insertions(+), 65 deletions(-) diff --git a/benchmarks/mstress/mstress.py b/benchmarks/mstress/mstress.py index 173dd5d4b..2f3d0a14b 100755 --- a/benchmarks/mstress/mstress.py +++ 
b/benchmarks/mstress/mstress.py @@ -162,14 +162,26 @@ def PrintMemoryUsage(opts): Globals.SERVER_KEYWORD, ) - proc = subprocess.Popen( - ["ssh", opts.server, psCmd], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) + if opts.server in ("localhost", "127.0.0.1"): + proc = subprocess.Popen( + [psCmd], + shell=True, + executable="/bin/bash", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + else: + proc = subprocess.Popen( + ["ssh", opts.server, psCmd], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) result = proc.communicate() if result and len(result[0].strip()) > 0: - print("Memory usage %sKB" % result[0].strip()) + memory = result[0].strip() + if not isinstance(memory, str): + memory = memory.decode("utf-8", "replace") + print("Memory usage %sKB" % memory) else: print("Memory usage KB") @@ -189,7 +201,7 @@ def RunMStressMaster(opts, hostsList): # print 'Master: called with %r, %r' % (opts, hostsList) startTime = datetime.datetime.now() - if RunMStressMasterTest(opts, hostsList, "create"): + if not RunMStressMasterTest(opts, hostsList, "create"): return False deltaTime = datetime.datetime.now() - startTime print( @@ -200,7 +212,7 @@ def RunMStressMaster(opts, hostsList): print("==========================================") startTime = datetime.datetime.now() - if RunMStressMasterTest(opts, hostsList, "stat"): + if not RunMStressMasterTest(opts, hostsList, "stat"): return False deltaTime = datetime.datetime.now() - startTime print( @@ -210,7 +222,7 @@ def RunMStressMaster(opts, hostsList): print("==========================================") startTime = datetime.datetime.now() - if RunMStressMasterTest(opts, hostsList, "readdir"): + if not RunMStressMasterTest(opts, hostsList, "readdir"): return False deltaTime = datetime.datetime.now() - startTime print( @@ -221,10 +233,10 @@ def RunMStressMaster(opts, hostsList): if opts.leave_files: print("\nNot deleting files because of -l option") - return False + return True startTime = 
datetime.datetime.now() - if RunMStressMasterTest(opts, hostsList, "delete"): + if not RunMStressMasterTest(opts, hostsList, "delete"): return False deltaTime = datetime.datetime.now() - startTime print( @@ -273,16 +285,30 @@ def RunMStressMasterTest(opts, hostsList, test): + opts.filesystem + ".slave.log" ) - p = subprocess.Popen( - [ - "/usr/bin/ssh", - client, - "%s -c %s -k %s >& %s" - % (ssh_cmd, client, clientHostMapping[client], slaveLogfile), - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + slave_cmd = "%s -c %s -k %s >& %s" % ( + ssh_cmd, + client, + clientHostMapping[client], + slaveLogfile, ) + if client in ("localhost", "127.0.0.1"): + p = subprocess.Popen( + [slave_cmd], + shell=True, + executable="/bin/bash", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + else: + p = subprocess.Popen( + [ + "/usr/bin/ssh", + client, + slave_cmd, + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) running_procs[p] = client success = True @@ -323,7 +349,7 @@ def RunMStressMasterTest(opts, hostsList, test): else: sys.stdout.write(".") sys.stdout.flush() - time.sleep(0.5) + time.sleep(0.05) return success @@ -439,7 +465,7 @@ def RunMStressSlave(opts, clientsPerHost): else: sys.stdout.write(".") sys.stdout.flush() - time.sleep(0.5) + time.sleep(0.05) return success diff --git a/benchmarks/mstress/mstress_client.cc b/benchmarks/mstress/mstress_client.cc index 15f416de4..96a7c1620 100644 --- a/benchmarks/mstress/mstress_client.cc +++ b/benchmarks/mstress/mstress_client.cc @@ -35,8 +35,8 @@ #include #include #include -#include #include +#include #if __cplusplus >= 201103L #include @@ -185,11 +185,9 @@ void hexout(char* str, int len) { printf("\n"); } -void myitoa(int n, char* buf) +void myitoa(int n, char* buf, size_t len = 32) { - static char result[32]; - snprintf(result, 32, "%d", n); - strcpy(buf, result); + snprintf(buf, len, "%d", n); } //Return a random permutation of numbers in [0..range). 
@@ -343,7 +341,7 @@ int CreateDFSPaths(Client* client, AutoCleanupKfsClient* kfs, int level, int* cr char name[512]; strncpy(name, client->prefix_.c_str(), sizeof(name) / sizeof(name[0]) - 1); for (int i = 0; i < client->inodesPerLevel_; i++) { - myitoa(i, name + client->prefixLen_); + myitoa(i, name + client->prefixLen_, sizeof(name) - client->prefixLen_); client->path_.Push(name); //hexout(client->path_.actualPath_, client->path_.len_ + 3); @@ -433,16 +431,16 @@ int StatDFSPaths(Client* client, AutoCleanupKfsClient* kfs) { for (int d = 0; d < client->levels_; d++) { int randIdx = rand() % client->inodesPerLevel_; - myitoa(randIdx, name + client->prefixLen_); + myitoa(randIdx, name + client->prefixLen_, sizeof(name) - client->prefixLen_); client->path_.Push(name); //fprintf(logFile, "Stat: path now is %s\n", client->path_.actualPath_); } //fprintf(logFile, "Stat: doing stat on [%s]\n", client->path_.actualPath_); KFS::KfsFileAttr attr; - int err = kfsClient->Stat(os.str().c_str(), attr); + int err = kfsClient->Stat(client->path_.String(), attr); if (err) { - fprintf(logFile, "error doing stat on %s\n", os.str().c_str()); + fprintf(logFile, "error doing stat on %s\n", client->path_.String()); return err; } @@ -466,14 +464,15 @@ int ListDFSPaths(Client* client, AutoCleanupKfsClient* kfs) { gettimeofday(&tvAlpha, NULL); int inodeCount = 0; - queue pending; + deque pending; ostringstream os; os << TEST_BASE_DIR << "/" << client->hostName_ + "_" << client->processName_; - pending.push(os.str()); + pending.push_back(os.str()); while (!pending.empty()) { - string parent = pending.front(); - pending.pop(); + string parent; + parent.swap(pending.front()); + pending.pop_front(); //fprintf(logFile, "readdir on parent [%s]\n", parent.c_str()); vector children; int err = kfsClient->ReaddirPlus(parent.c_str(), children); @@ -482,20 +481,19 @@ int ListDFSPaths(Client* client, AutoCleanupKfsClient* kfs) { return err; } while (!children.empty()) { - string child = 
children.back().filename; - bool isDir = children.back().isDirectory; - children.pop_back(); + const KFS::KfsFileAttr& childAttr = children.back(); + const string& child = childAttr.filename; + bool isDir = childAttr.isDirectory; //fprintf(logFile, " Child = %s inodeCount=%d\n", child.c_str(), inodeCount); - if (child == "." || - child == "..") { - continue; - } - inodeCount ++; - if (isDir) { - string nextParent = parent + "/" + child; - pending.push(nextParent); - //fprintf(logFile, " Adding next parent [%s]\n", nextParent.c_str()); + if (child != "." && child != "..") { + inodeCount ++; + if (isDir) { + string nextParent = parent + "/" + child; + pending.push_back(nextParent); + //fprintf(logFile, " Adding next parent [%s]\n", nextParent.c_str()); + } } + children.pop_back(); if (inodeCount > 0 && inodeCount % COUNT_INCR == 0) { fprintf(logFile, "Readdir paths so far: %d\n", inodeCount); } @@ -546,7 +544,7 @@ int RemoveDFSPaths(Client* client, AutoCleanupKfsClient* kfs) { while (lev < client->levels_) { pos = idx / client->inodesPerLevel_; delta = idx - (pos * client->inodesPerLevel_); - myitoa(delta, sfx); + myitoa(delta, sfx, sizeof(sfx)); if (pathSoFar.length()) { pathSoFar = client->prefix_ + sfx + "/" + pathSoFar; } else { diff --git a/benchmarks/mstress/mstress_plan.py b/benchmarks/mstress/mstress_plan.py index 6388e531c..4cfcbc855 100755 --- a/benchmarks/mstress/mstress_plan.py +++ b/benchmarks/mstress/mstress_plan.py @@ -183,6 +183,9 @@ def main(): print("==> Created planfile: %s" % opts.output_file) print("copying file %s to all client hosts" % opts.output_file) for client in hostlist: + if client in ("localhost", "127.0.0.1"): + print("available %s on %s" % (opts.output_file, client)) + continue p = subprocess.Popen( [ "/usr/bin/scp", @@ -195,7 +198,9 @@ def main(): if ret is None: time.sleep(0.5) else: - print("transfered %s to %s" % (opts.output_file, client)) + if ret != 0: + sys.exit("failed to transfer %s to %s" % (opts.output_file, client)) + 
print("transferred %s to %s" % (opts.output_file, client)) break diff --git a/benchmarks/mstress/mstress_run.py b/benchmarks/mstress/mstress_run.py index 05dde0899..ef564d3e8 100755 --- a/benchmarks/mstress/mstress_run.py +++ b/benchmarks/mstress/mstress_run.py @@ -45,7 +45,7 @@ def NumFiles2Stat(): Params.INODES_PER_LEVEL**Params.PATH_LEVELS * Params.CLIENTS_PER_HOST * len(Params.CLIENT_HOSTS.split(",")) - / 2 + // 2 ) NumFiles2Stat = staticmethod(NumFiles2Stat) @@ -136,33 +136,30 @@ def Execute(type, args): % type ) - result = "" + result = [] proc = subprocess.Popen( - args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT + args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + universal_newlines=True ) - while proc.poll() is None: - output = proc.stdout.read(1) - result += output + for output in iter(proc.stdout.readline, ""): + result.append(output) sys.stdout.write(output) sys.stdout.flush() - - output = proc.stdout.read() - result += output - sys.stdout.write(output) - sys.stdout.flush() proc.wait() - return result + return "".join(result) def PrintResult(type, result): PrintMsg("\nBenchmark results for '%s':" % type) for m in re.findall(r"(\w+) test took (\S+) sec", result): PrintMsg("%-10s: %s sec" % (m[0], m[1])) - PrintMsg( - "\n%s\n==========================================" - % re.search(r"Memory usage .*$", result, re.MULTILINE).group(0) - ) + memory = re.search(r"Memory usage .*", result, re.MULTILINE) + if memory: + PrintMsg( + "\n%s\n==========================================" + % memory.group(0) + ) def ParseArgs(): From da95513e4289f5a52c0b2f3029642b55e0851927 Mon Sep 17 00:00:00 2001 From: zhangzhibiao Date: Mon, 25 May 2026 17:07:56 +0800 Subject: [PATCH 3/7] opt lock --- MetaTree-Lock-Optimization.md | 227 ++++++++++++++++++++++++++++++ scripts/run_mstress_create.sh | 39 +++++ scripts/start_clean_metaserver.sh | 86 +++++++++++ 3 files changed, 352 insertions(+) create mode 100644 MetaTree-Lock-Optimization.md create mode 100755 
scripts/run_mstress_create.sh create mode 100755 scripts/start_clean_metaserver.sh diff --git a/MetaTree-Lock-Optimization.md b/MetaTree-Lock-Optimization.md new file mode 100644 index 000000000..4330d1eb5 --- /dev/null +++ b/MetaTree-Lock-Optimization.md @@ -0,0 +1,227 @@ +# MetaTree Lock Optimization Plan + +## Background + +The create-file benchmark shows poor scaling when increasing +`metaServer.clientThreadCount`. In the measured runs, `INFO + 4` client threads +was faster than `INFO + 20`, which points to lock contention inside the meta +server rather than chunkserver or data path bottlenecks. + +For empty-file create, the hot path is: + +1. `ClientThread::DispatchStart()` +2. `submit_request()` +3. `MetaRequest::SubmitBegin()` +4. `LogWriter::Enqueue()` +5. `MetaCreate::start()` +6. `MetaCreate::handle()` +7. `Tree::create()` +8. `LogWriter::ScheduleFlush()` + +`Tree::create()` mutates the global metadata tree by doing lookup, optional +remove, fid allocation, dentry/fattr insertion, and count updates. + +## Current Locking Problem + +Before this optimization, client threads held the global net dispatch mutex while +processing pending metadata requests: + +```text +dispatch mutex + submit_request() + MetaRequest::Submit() + LogWriter::Enqueue() + MetaCreate::handle() + metatree.create() + LogWriter::ScheduleFlush() +``` + +This makes `metaServer.clientThreadCount` scale poorly. More client threads +mainly increase contention on the same dispatch mutex. + +## Important Constraint + +The metadata tree is not currently safe for simple per-parent-directory locking. +It is implemented as a single global B-tree. Even creates in different parent +directories can modify shared B-tree nodes, split internal nodes, update the +root, or touch shared indexes. + +Therefore, this is unsafe as a direct first step: + +```text +lock(parent_dir) + Tree::create(parent_dir, name) +``` + +That would protect directory-level semantics but not the global B-tree data +structure. 
+ +## Implemented First Step + +The first step separates the broad dispatch lock from metadata mutation and log +writer state. + +### 1. Add a dedicated metadata request mutex + +`submit_request()` now takes a dedicated metadata processing mutex before calling +`MetaRequest::Submit()`. + +This preserves existing metatree safety while removing metadata processing from +the net dispatch mutex. + +```text +meta request mutex + MetaRequest::Submit() + MetaCreate::handle() + metatree.create() +``` + +### 2. Shrink dispatch mutex scope + +`ClientThread::DispatchStart()` now keeps the dispatch mutex only around fork +coordination and auth context update. It does not hold the dispatch mutex while +processing the request batch. + +This changes the lock shape to: + +```text +dispatch mutex + PrepareToFork() + auth context update + ForkDone() + +meta request mutex + submit_request() +``` + +### 3. Protect LogWriter state with LogWriter mutex + +Moving request processing out of the dispatch mutex means LogWriter can no +longer rely on dispatch serialization. The following paths now explicitly use +`LogWriter::mMutex`: + +```text +LogWriter::Enqueue() +LogWriter::RequestCommitted() +LogWriter::ScheduleFlush() +``` + +This protects pending queues, commit state, and flush scheduling when multiple +client threads reach the log writer concurrently. + +## What This Does Not Yet Solve + +This first step does not make `Tree::create()` itself parallel across parent +directories. It deliberately keeps metadata mutation serialized through the +metadata request mutex. + +The goal is to remove one oversized outer lock and introduce clearer lock +ownership: + +```text +dispatch state -> dispatch mutex +metadata mutation -> metadata request mutex +log writer state -> log writer mutex +``` + +This is a safe prerequisite for deeper metatree concurrency work. 
+ +## Next Steps Toward True MetaTree Concurrency + +### Step 1: Add profiling around the new lock boundaries + +Measure: + +```text +dispatch mutex wait / hold time +metadata request mutex wait / hold time +LogWriter mutex wait / hold time +Tree::create() latency +Tree::lookup() latency +Tree::link() latency +log flush batch size and latency +``` + +This confirms whether contention moved from dispatch mutex to metadata request +mutex or LogWriter. + +### Step 2: Split read-only and mutation requests + +Introduce a metadata operation classification: + +```text +read-only ops +mutation ops +log-dependent mutation ops +``` + +Read-only requests can eventually run under a shared/read lock, while mutation +requests keep exclusive protection. + +### Step 3: Refactor metatree storage for sharding + +True per-directory create parallelism needs the data structure to stop using one +global mutable B-tree for all dentries/fattrs. + +Candidate direction: + +```text +fid/fattr index: separately protected or sharded by fid +dentry index: sharded by parent fid +path cache: separately protected or disabled on mutation-heavy workloads +directory counters: parent-chain locking with stable lock ordering +``` + +Only after this split is it safe to use parent-directory locks for create. + +### Step 4: Add parent-directory locking + +Once dentry storage is sharded by parent fid: + +```text +lock(parent_dir) + check permissions + lookup child name + allocate fid + insert dentry in parent shard + insert fattr in fid shard + update parent counters +``` + +Lock ordering must be explicit. Rename is the main hard case because it touches +two parent directories and may update path cache and subtree invariants. + +### Step 5: Validate replay and transaction ordering + +Create is a logged operation. 
Any concurrency change must preserve: + +```text +log sequence order +fid seed replay correctness +idempotent request behavior +rename/create/remove ordering +checkpoint consistency +``` + +Log ordering can remain serialized even if independent metatree mutations become +parallel internally. + +## Verification + +Current first-step build verification: + +```bash +cmake --build bld --target metaserver -j4 +``` + +The target builds successfully. + +## Risk Notes + +The safe first step may improve throughput if the dispatch mutex was the main +contention point. If the bottleneck is now the metadata request mutex or log +writer, throughput may not improve significantly. + +Do not replace the metadata request mutex with a parent-directory lock until the +global B-tree has been refactored or otherwise proven safe for concurrent +mutation. diff --git a/scripts/run_mstress_create.sh b/scripts/run_mstress_create.sh new file mode 100755 index 000000000..a7d7fcf81 --- /dev/null +++ b/scripts/run_mstress_create.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd /work/bigo-qfs + +PLAN_FILE="${1:-output/mstress_1m_after_lock.plan}" +QFS="bld/output/bin/tools/qfs" +QFS_CFG="qfsbase/client/clidefault.prp" +MSTRESS="bld/benchmarks/mstress/mstress.py" +META_HOST="localhost" +META_PORT="20000" + +if [ ! -f "${PLAN_FILE}" ]; then + echo "Plan file not found: ${PLAN_FILE}" >&2 + exit 1 +fi + +echo "Checking metaserver..." +bld/output/bin/tools/qfsping -m -s "${META_HOST}" -p "${META_PORT}" + +echo "Cleaning /mstress..." 
+"${QFS}" \ + -D dfs.force.remove=true \ + -cfg "${QFS_CFG}" \ + -rmr /mstress >/dev/null 2>&1 || true + +echo "Running create benchmark with plan: ${PLAN_FILE}" +python "${MSTRESS}" \ + -m slave \ + -f qfs \ + -s "${META_HOST}" \ + -p "${META_PORT}" \ + -t create \ + -a "${PLAN_FILE}" \ + -c localhost \ + -k localhost + +echo "Summary:" +rg -n "paths created|failed|ERROR|FATAL" "${PLAN_FILE}"* 2>/dev/null || true diff --git a/scripts/start_clean_metaserver.sh b/scripts/start_clean_metaserver.sh new file mode 100755 index 000000000..cbfcd1db6 --- /dev/null +++ b/scripts/start_clean_metaserver.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd /work/bigo-qfs + +META_BIN="bld/output/bin/metaserver" +META_CONF="qfsbase/meta/conf/MetaServer.prp" +META_LOG="qfsbase/meta/MetaServer.log" +META_OUT="qfsbase/meta/MetaServer.out" +META_BASE="qfsbase/meta" +TS="$(date +%Y%m%d_%H%M%S)" + +stop_pid_file() { + local pid_file="$1" + if [ -f "${pid_file}" ]; then + local pid + pid="$(cat "${pid_file}" || true)" + if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then + kill "${pid}" || true + fi + rm -f "${pid_file}" + fi +} + +echo "Stopping existing metaserver, if any..." +stop_pid_file "${META_BASE}/metaserver.pid" +pkill -f "${META_BIN} ${META_CONF}" 2>/dev/null || true + +echo "Stopping existing chunkservers, if any..." +for idx in 1 2 3; do + stop_pid_file "qfsbase/chunk${idx}/chunkserver.pid" +done +pkill -f "bld/output/bin/chunkserver qfsbase/chunk" 2>/dev/null || true +sleep 1 + +echo "Archiving old meta logs/checkpoints..." +mkdir -p "${META_BASE}" +if [ -d "${META_BASE}/logs" ]; then + mv "${META_BASE}/logs" "${META_BASE}/logs.bak.${TS}" +fi +if [ -d "${META_BASE}/checkpoints" ]; then + mv "${META_BASE}/checkpoints" "${META_BASE}/checkpoints.bak.${TS}" +fi +mkdir -p "${META_BASE}/logs" "${META_BASE}/checkpoints" + +echo "Archiving old chunkserver local state..." 
+for idx in 1 2 3; do + CHUNK_BASE="qfsbase/chunk${idx}" + for path in "${CHUNK_BASE}"/chunkdir*; do + if [ -d "${path}" ]; then + mv "${path}" "${path}.bak.${TS}" + mkdir -p "${path}" + fi + done +done + +echo "Creating clean filesystem..." +"${META_BIN}" \ + -c \ + "${META_CONF}" \ + "${META_LOG}" \ + > "${META_OUT}" 2>&1 + +echo "Starting metaserver..." +setsid -f "${META_BIN}" \ + "${META_CONF}" \ + "${META_LOG}" \ + >> "${META_OUT}" 2>&1 + +sleep 2 + +echo "Starting chunkservers..." +for idx in 1 2 3; do + setsid -f bld/output/bin/chunkserver \ + "qfsbase/chunk${idx}/conf/ChunkServer.prp" \ + "qfsbase/chunk${idx}/ChunkServer.log" \ + > "qfsbase/chunk${idx}/ChunkServer.out" 2>&1 +done + +sleep 5 + +echo "Process:" +ps -ef | awk "/bld\/output\/bin\/metaserver|bld\/output\/bin\/chunkserver/ && !/awk/ {print}" + +echo "Ping:" +bld/output/bin/tools/qfsping -m -s localhost -p 20000 From d11e34d9fdf287a3067d9e5fede7baba3f369b14 Mon Sep 17 00:00:00 2001 From: zhangzhibiao Date: Mon, 25 May 2026 17:58:17 +0800 Subject: [PATCH 4/7] add rfc --- .../RFC-0001-memory-native-metadata-layer.md | 908 ++++++++++++++++++ 1 file changed, 908 insertions(+) create mode 100644 docs/rfc/RFC-0001-memory-native-metadata-layer.md diff --git a/docs/rfc/RFC-0001-memory-native-metadata-layer.md b/docs/rfc/RFC-0001-memory-native-metadata-layer.md new file mode 100644 index 000000000..6074bfe2c --- /dev/null +++ b/docs/rfc/RFC-0001-memory-native-metadata-layer.md @@ -0,0 +1,908 @@ +# RFC-0001: 内存原生元数据层(Memory-Native Metadata Layer) + +| 字段 | 值 | +|------|-----| +| **状态** | Draft | +| **日期** | 2026-05-25 | +| **相关** | QFS `metatree`(B+ 树)、`LogWriter`、HDFS NameNode edit log + FSImage | +| **动机来源** | CREATE 延迟分析、与 HDFS NN 路径对比、绿场元数据设计讨论 | + +--- + +## 摘要 + +本 RFC 提议为 QFS 类分布式文件系统定义一套**从 0 设计的内存原生元数据层**:命名空间用 **按目录分片的哈希索引 + 全局 inode 表** 维护,持久化采用 **edit log + HDFS 式用户快照(引用/COW)+ 周期性 Checkpoint(FSImage)**,与当前 **单一 B+ 树(`metatree`)+ 先 WAL 后改树** 的实现路线对比,并给出实现与分阶段交付路径(**不含**从现有 B+ 树/checkpoint 
的迁移方案)。 + +目标是在可比持久化语义下,将 **`create` / `lookup` 的热路径** 从「多次 B+ 树 descent + 双 insert + 全局串行」收敛为「O(1) 内存索引 + 摊销组提交 fsync」,并保留 VR/幂等等生产特性。 + +--- + +## 1. 背景与动机 + +### 1.1 当前 QFS 元数据路径(CREATE) + +空文件 `CREATE` 的典型路径: + +```text +ClientThread → submit_request() [全局互斥] + → MetaCreate::start() [校验,不改树] + → LogWriter::Enqueue() [写 transaction log] + → (log committed 后)MetaCreate::handle() + → Tree::create() + → getFattr(parent) [B+ 树查找] + → lookup(parent, name) [B+ 树查找] + → link() + → insert(MetaDentry) [B+ 树插入 #1] + → insert(MetaFattr) [B+ 树插入 #2] +``` + +特征: + +- 命名空间、dentry、fattr、chunkinfo 混在同一棵 **B+ 树**(`kfstree.h` 明确为 B+ 树)。 +- **先 durable log,再** 修改内存树(`SubmitBegin` + `LogWriter` 队列)。 +- `MetaIdempotentRequest` 将 `logAction` 设为 **`kLogAlways`**,成功路径几乎必落 log。 +- 核心处理在 **`submit_request` 全局锁** 下串行(见 `NetDispatch.cc` 注释)。 + +### 1.2 HDFS NameNode 对照 + +HDFS 将问题拆成两层: + +| 层 | 实现要点 | +|----|----------| +| 运行时 | 全内存 inode 树;目录下按名索引(hash/map);`create` 主要为内存挂接 | +| 持久化 | Edit log(操作记录)+ FSImage/checkpoint;fsync 可组提交 | + +Chunk 分配通常在 **首次 write / addBlock**,而非 `create` 本身,因此 NN 上 `create` 常数项小。 + +### 1.3 结论 + +QFS **并非不能** 采用 HDFS NN 式布局;当前选择是 **统一 B+ 树 + 树形 checkpoint** 的历史工程路线。在 CREATE 延迟与元数据 QPS 成为瓶颈时,绿场或下一代元数据层值得单独设计,而非仅在 B+ 树上做局部锁优化(参见仓库内 `MetaTree-Lock-Optimization.md`)。 + +--- + +## 2. 目标与非目标 + +### 2.1 目标 + +1. **`create` / `lookup`(按 fid + name)**:小目录 **O(1) 均摊**(Small);超大目录 **O(log N)**(Large,§4.2);均无全局 `metatree` descent。 +2. **持久化**:edit log + §6.3 用户快照 + §6.4 Checkpoint;支持 **`sync=none | batch | always`** 三档。 +3. **吞吐**:通过 **目录分片锁 + log 单写线程组提交** 提升并行度。 +4. **语义**:保留客户端 **父目录 fid**、幂等 `(session_id, op_id)`、VR/quorum 复制(与现有 MetaServer 部署模型兼容)。 +5. **规模(单机)**:假定 **单个 MetaServer 进程、单机 RAM 容纳全部 namespace**(`DirIndex` + `InodeTable` + 可选 `BlockMap`);容量规划为运维/部署话题,本 RFC 不定义上限模型。 +6. 
**大目录(首版必做)**:单目录 **百万级** 子项时,`lookup` / `create` / `readdir` 不得退化为「单哈希桶长链遍历」;须使用 §4.2 的 **Small/Large 双布局** 与 **升格(promotion)** 机制。 + +### 2.2 非目标(本 RFC 首版) + +- 不定义 chunk 数据路径、纠删码、LayoutManager 细节(仅要求 **BlockMap 与 namespace 解耦**)。 +- 不替换现有 ChunkServer 协议。 +- **不考虑**从现有 B+ 树 / checkpoint / transaction log 的**离线迁移、在线双写、回滚**(若需要,另起 RFC)。 +- **不考虑**元数据 **水平分片**(多 MetaServer 各管一段 namespace)、**冷 inode / namespace 换出**、单机内存超限后的分级存储(另起 RFC)。 +- 不实现完整 POSIX(符号链接、硬链接语义等可后续 RFC 补充)。 + +本 RFC 假定 **绿场部署**:新集群以 v2 snapshot + edit log **冷启动**,或与现行 `metatree` 并行存在、不互通。 + +文中 **「分片」** 若无特别说明,均指 **单进程内** 的锁分片 / `hash(fid) % N` 数据结构分片,**不是** 集群级 namespace 分片。 + +--- + +## 3. 提议架构 + +### 3.1 逻辑分层 + +```text +┌─────────────────────────────────────────────────────────┐ +│ RPC 层:CREATE / LOOKUP / READDIR / REMOVE / RENAME … │ +└───────────────────────────┬─────────────────────────────┘ + │ +┌───────────────────────────▼─────────────────────────────┐ +│ 内存权威层(Authoritative In-Memory State) │ +│ • Namespace:DirTable[parent_fid] → DirNode(Small/Large)│ +│ • InodeTable[fid] → Inode(属性、parent、类型、计数) │ +│ • BlockMap[fid] → chunk 列表(可选独立模块/服务) │ +│ • (无服务端 PathCache;路径缓存仅在客户端,见 §4.3) │ +└───────────────────────────┬─────────────────────────────┘ + │ 仅追加操作记录 +┌───────────────────────────▼─────────────────────────────┐ +│ 持久化层 │ +│ • Edit Log(二进制 op,组提交 fsync) │ +│ • 用户快照:HDFS 式引用 + 文件级 COW(§6.3) │ +│ • Checkpoint/FSImage:一致性点 N + 后台遍历(§6.4) │ +│ • Quorum / VR 复制(复用现有 LogWriter/VR 基础设施) │ +└─────────────────────────────────────────────────────────┘ +``` + +### 3.2 与 QFS 现状的核心差异 + +| 维度 | QFS 现状 | 本 RFC | +|------|----------|--------| +| 主索引 | 全局 B+ 树,dentry/fattr 不同 key | DirNode(Small hash / Large **复用 `kfstree` 每目录一棵 `Tree`**)+ InodeTable | +| CREATE 索引操作 | 2× 全局 `insert` + 多次 `findLeaf` | 1× DirNode insert + 1× InodeTable insert | +| 百万级单目录 | 同全局树叶子链/同桶冲突风险 | Large 布局 O(log N),首版必做 promotion | +| 持久化顺序 | 先 WAL committed,再 `handle()` | 临界区内改内存 + append log buffer;fsync 摊销 | +| 用户快照 | (现 QFS 
无同等机制) | `InodeRef` + 文件级 COW,创建 O(1)(§6.3) | +| checkpoint | B+ 树页/节点序列化 | 模糊 FSImage(§6.4)+ replay txn>N | +| chunk 元数据 | 同树 `KFS_CHUNKINFO` | BlockMap 分离,allocate 时再写 | + +--- + +## 4. 内存数据结构 + +### 4.1 Inode + +```text +Inode { + fid: u64 // 全局唯一,单调分配(单机) + type: file | dir | symlink + parent_fid: u64 // 父目录;根目录哨兵 + mode, uid, gid, size, mtime, ctime, atime + nlink, flags // 见 §8.4:WORM、dumpster 子树、striping 等 + snapshottable: bool // 目录可打快照(§6.3) + snap_ref_count: u32 // 被用户快照持有的 frozen 引用数(§6.3.6);live inode 常为 0 + replication | ec_policy // 或仅指针,详细布局在 allocate 时设置 + dir_child_count // 仅目录;用于 readdir 分页提示 + generation: u64 // 每次 rename/unlink/rmdir/promotion 递增,供 cache 失效 +} +``` + +存储:`InodeTable` 为 `fid → Inode`,数组分片或 `flat_hash_map` 分片。 + +### 4.2 目录索引 DirIndex(含大目录,首版必做) + +**问题**:若每个目录仅用一个全局 `HashMap` 且冲突用链表串接,则单目录 **百万文件** 时会出现极端哈希桶长链,`lookup` / `create` / `readdir` 退化为 **O(N)**,成为新瓶颈。 + +**决策**:每个目录一个 **`DirNode`**,首版即实现 **Small(哈希)+ Large(有序索引)** 两种布局,并在子项数超过阈值时 **强制升格**(promotion),不是后续可选优化。 + +#### 4.2.1 公共类型 + +```text +NameKey = (name_hash: u64, name: string) // name_hash = Hsieh(name)<<4,与现 MetaDentry 一致 +DirEntry = { child_fid, name } +DirNode = { + state: SMALL | PROMOTING | LARGE // 见 §4.2.6 + generation: u64 // promotion 完成后递增,失效 readdir cookie + child_count: u64 + body: SmallDir | LargeDir // PROMOTING 期间读者只访问 Small + staging: LargeDir? 
// 仅 PROMOTING:构建完成前对读者不可见 +} +``` + +全局:`DirTable[parent_fid] → DirNode`(按 `parent_fid` 分片锁,§7;晋升见 §4.2.6)。 + +#### 4.2.2 Small 布局(子项数 < `dir_large_threshold`) + +- 结构:**开放寻址** `flat_hash_map`(Robin Hood 或等价),**禁止**无限链表冲突链。 +- **`lookup` / `create`**:均摊 **O(1)**;探测次数有硬上限 `max_probe`(如 16),插入时若接近满负荷或探测失败则 **触发升格** 而非继续堆链。 +- 默认阈值 **`dir_large_threshold = 4096`**(可配置 `meta.dir.largeThreshold`)。 + +#### 4.2.3 Large 布局(子项数 ≥ 阈值,或 Small 无法安全插入) + +**决策:Large 布局直接复用当前 QFS B+ 树实现**(`kfstree.h` / `kfstree.cc`),不新写一套目录 B-tree。与全局 `metatree` 的差异仅是 **每目录一棵独立 `Tree` 实例**,键空间 scoped 在该 `parent_fid` 下。 + +| 复用组件 | 路径 / 说明 | +|----------|-------------| +| 内部节点 | `Node`(`NKEY=170`,4096B 页式节点,`findplace` 二分,`split` / `merge`) | +| 树操作 | `Tree::insert`、`Tree::del`、`lowerBound` / `findLeaf`、`LeafIter` | +| 键 | 现有 `Key` / `PartialMatch`;叶键 **`Key(KFS_DENTRY, parent_fid, name_hash)`**,与现 `MetaDentry::keySelf()` 一致 | +| 叶记录 | `MetaDentry`(或薄封装 `DirBTreeLeaf` 内嵌相同字段);`matchSelf` 比对 `name` | +| 内存 | `MetaNode::allocate` / `PoolAllocator`(与现 meta 节点相同) | + +```text +LargeDir { + parent_fid: fid_t + tree: Tree // 现 kfstree.Tree,非全局 metatree 单例 +} +``` + +- **语义**:逻辑上仍是「该目录下 name → child_fid」;物理上用 **一棵子树** 存该目录全部 `MetaDentry` 叶,**不再**插入全局 `metatree` 的混合 key 空间。 +- **`lookup` / `create`**:对该目录的 `Tree` 调用与现 `getDentry` / `insert` 相同逻辑(`findLeaf` + 叶链 `peer()` 扫同名 hash),**O(log N)**,无百万长链。 +- **`readdir`**:`LeafIter` 逻辑序遍历 + §5.4 **逻辑位置 cookie**(禁止裸指针)。 +- **升格(promotion)**:原子性与并发语义见 **§4.2.6**(`PROMOTING` 状态、写阻塞、读仍用 Small、staging 完成后一次性切换)。 +- **checkpoint / fsck**:Large 目录序列化可 **复用现 Node/Meta checkpoint 格式**;fsck 见 §8.5(`PROMOTING` 视为 transient,持久化快照中不应出现)。 + +**不新写**:单独的目录 B-tree 节点类型、另一套 split/merge 或不同于 `Node` 页大小的树实现。 + +#### 4.2.4 复杂度与验收(百万级单目录) + +| 操作 | Small | Large | +|------|-------|-------| +| lookup | O(1) 均摊,探测有界 | O(log N) | +| create | O(1) 均摊或触发 O(N) 一次性 promotion | O(log N) | +| readdir 一页 | O(page) 或扫描有界桶 | O(log N + page) | + +**禁止**:单桶链表长度 ∝ N、百万次指针追逐的「伪 O(1) 哈希表」。 + 
+#### 4.2.5 与全局 `metatree` 的关系 + +| | 全局 `metatree`(现 QFS) | Large `DirNode`(本 RFC) | +|--|---------------------------|---------------------------| +| 代码 | `kfstree` | **同一套** `kfstree` | +| 实例 | 单例 `metatree`,混放 dentry/fattr/chunk | **每超大目录一个 `Tree`** | +| create 副作用 | 可能 split 共享祖先内部节点 | 仅影响该目录子树 | +| 小目录 | 也走全局树 | **Small `flat_hash`**,不进 B+ 树 | + +InodeTable、BlockMap **不再**进入任何 B+ 树;仅 **超大目录的子项列表** 使用 `Tree` 存 `MetaDentry` 叶。 + +#### 4.2.6 晋升(Promotion)的原子性与可见性(已决) + +**问题**:§4.2.3 若在「半建成」的 Large `Tree` 上并发 `lookup`/`create`,可能看到 **不完整** 的 B+ 树或 Small/Large 双写混乱。 + +**决策**:`DirNode` 增加 **`state`**;晋升在 **staging** 中构建 Large,通过 **一次性发布** 切换;晋升期间 **读走 Small、写阻塞或排队**。 + +##### 状态机 + +```text +SMALL ──(触发晋升)──► PROMOTING ──(发布完成)──► LARGE + │ + └── 失败回滚 ──► SMALL(见下) +``` + +| `state` | 读者 (`lookup`/`readdir`) | 写者 (`create`/`unlink`/`rename` 子项) | +|---------|---------------------------|----------------------------------------| +| **SMALL** | `body.small` | 正常;可能触发进入 PROMOTING | +| **PROMOTING** | **仅** `body.small`(不读 `staging`) | **阻塞**于 `promote_cv` 或同目录写队列,直到 `LARGE` | +| **LARGE** | `body.large.tree` | 正常 `kfstree` 路径 | + +##### 晋升算法(持有 `DirTable[parent]` 互斥或写锁) + +```text +promote_small_to_large(parent_fid): + lock(dir) // 目录分片写锁;阻塞其它写者,读者见下 + + 1. assert(state == SMALL) + 2. state = PROMOTING + 3. staging.large = new Tree() // 读者不可见 + 4. for entry in body.small: // 只读 Small,不改 Small + staging.large.insert(MetaDentry(...)) + 5. // 一次性发布(原子切换可见布局) + body.large = move(staging.large) + staging = null + free(body.small) + state = LARGE + generation++ // 失效 readdir cookie / 客户端 path 缓存 + 6. broadcast(promote_cv) // 唤醒排队写者 + 7. 
append EditLog(DIR_PROMOTE, parent_fid, generation) + unlock(dir) +``` + +- **「原子」含义**:在步骤 5 之前,任何 RPC **不可能** 观察到 `staging` 或半填充的 `body.large`;步骤 5 之后,**不可能** 再观察到 `body.small`。 +- 实现上可用 **同一把目录锁** 包裹步骤 2–6;步骤 5 的字段赋值顺序:`body.large` 生效 → `staging` 清空 → `state=LARGE` → 释放 `small`(避免读者看到 `LARGE` 但 body 仍为空,或在 `state` 仍为 `PROMOTING` 时读到已释放的 Small;上方伪代码中 `free(body.small)` 写在 `state = LARGE` 之前,实现以此处顺序为准)。 + +##### 并发 `lookup` / `create`(与 §7 分片锁配合) + +| 操作 | `state == PROMOTING` 时行为 | +|------|---------------------------| +| **lookup** | 获取目录 **读锁**(或与写互斥的 `shared_lock`):读 **`body.small` 快照**,与晋升线程不共享写;晋升 **不修改** Small,只读遍历。 | +| **readdir** | 同 lookup;cookie 若带旧 `generation`,晋升完成后返回 **失效**,客户端重试。 | +| **create** | 需目录 **写锁**:若 `PROMOTING`,**等待** 晋升完成(`promote_cv`),不得在半成品 Large 上 insert。 | +| **触发晋升的 create** | 当前线程持写锁执行 `promote_small_to_large`,完成后在同一锁内对 **Large** 执行 insert。 | + +**不采用**:晋升过程中对活动 RPC 暴露「部分迁移」的 Large;不采用无 `PROMOTING` 标记、原地边建树边切换 `layout` 字段。 + +##### 失败与恢复 + +- 若步骤 4 失败:`state` 回滚 **SMALL**,丢弃 `staging`,`generation` 不变,唤醒等待者并返回错误。 +- Edit log 仅在 **成功** 步骤 7 记录 `DIR_PROMOTE`;replay 时目录应已为 **LARGE**(或从 snapshot 还原 layout 字段)。 +- §6.4 Checkpoint 扫描时:若发现 `PROMOTING`(崩溃中间态),按 **SMALL** 序列化并打标需 **重做 promotion** 或 fsck 修复(运维策略,首版可 panic 要求重放 log 修复)。 + +##### 与 §5 热路径的衔接 + +```text +DirTable[parent].lookup(name): + lock_shared(dir) + switch (state): + SMALL | PROMOTING → return body.small.find(name) + LARGE → return body.large.tree.lookup(...) + +DirTable[parent].insert(name, child_fid): + lock_exclusive(dir) + while (state == PROMOTING) wait(promote_cv) + if state == SMALL && need_promote(): promote_small_to_large() // 仍持写锁 + ... insert into active body ...
+``` + +### 4.3 路径缓存(已决:仅客户端) + +**决策:不在 MetaServer 集群内维护、复制或共享 PathCache**(对比现 QFS 可选的 `metaServer.enablePathToFidCache`,本设计 **不** 在服务端做路径→fid 缓存)。 + +| 侧 | 职责 | +|----|------| +| **客户端** | 维护 `path → fid`、`parent_fid` 等缓存;热路径用 **fid + name** 发 RPC,避免 `LOOKUP_PATH`。 | +| **服务端** | 不存 PathCache;`lookup` / `LOOKUP_PATH` 每次按 `DirIndex` 解析。通过 RPC 响应携带 **`generation`**(目录或 inode 上的单调版本),供客户端判断缓存是否失效。 | + +**目录 `generation`**(§4.1 Inode):在 `rename` / `rmdir` / `unlink` / 子树变更时递增;客户端比对 `(path, cached_fid, cached_generation)`,不一致则丢弃该路径缓存项并重新解析。 + +**失效规则(客户端本地)**: + +- 单文件 `remove` / `rename`:失效该路径及已知子路径前缀(若有目录缓存树)。 +- 目录 `rmdir`:失效以该路径为前缀的全部缓存项。 +- 收到别客户端 mutating 成功且本地无 generation 时:可保守失效父目录缓存,或依赖后续 `LOOKUP` 失败再刷新。 + +**不采用**:MetaServer 间复制 path cache、standby 只读副本提供缓存命中、或全局 `PathToFidCacheMap`(避免一致性、失效广播与内存占用问题)。 + +### 4.4 BlockMap(与 namespace 分离) + +```text +BlockMap : 按 fid 分片 + fid → [ ChunkInfo { chunk_id, offset, version, locations, tier } ] +``` + +- **`create` 不写入 BlockMap**(与 HDFS 一致)。 +- **`allocate` / `append`** 才追加 chunk 记录;edit log 使用独立 op 类型。 + +--- + +## 5. 热路径算法 + +### 5.1 CREATE(空文件) + +**前置**:客户端提供 `parent_fid` + `name`(已有 QFS `MetaCreate::dir`)。 + +```text +1. shard = hash(parent_fid) % N_SHARDS +2. lock(DirShard[shard]) +3. if DirIndex[parent].contains(name) → 处理 exclusive / truncate 语义 +4. new_fid = FidAllocator.next() +5. DirTable[parent].insert(name, new_fid) // §4.2,必要时 promotion +6. InodeTable[new_fid] = Inode{ parent, attrs... } +7. update parent.mtime, parent.file_count +8. txn = EditLog.append(CREATE, parent, name, new_fid, attrs, op_id) +9. unlock +10. if sync_policy == always: wait(txn.committed) +11. 
return new_fid +``` + +**树操作次数**:0。持久化:1 条 edit(组提交时与其他 op 共享一次 fsync)。 + +### 5.2 LOOKUP(单级) + +```text +lock(DirShard[hash(parent)]) + entry = DirTable[parent].lookup(name) + fa = InodeTable[entry.fid] +unlock +→ 权限检查 +``` + +### 5.3 LOOKUP_PATH + +**服务端**无 PathCache:按 `/` 分段,**每段一次 DirIndex 查找**,最后一段做 access check。 +**客户端**应先查本地 path 缓存(§4.3);未命中再发 `LOOKUP_PATH` 或分段 `LOOKUP`(持 `parent_fid`)。 + +### 5.4 READDIR + +```text +readdir(parent, cookie, max_entries) → 分页返回 DirEntry 列表 +``` + +| `DirNode.state` | 遍历方式 | cookie 概要 | +|-----------------|----------|-------------| +| **SMALL** | 桶序 + 桶内序 | 逻辑位置(§5.4.1) | +| **PROMOTING** | 仍按 Small | 同 SMALL;`generation` 未变 | +| **LARGE** | B+ 树 key 序(`kfstree`) | **逻辑 key 游标**,禁止节点指针 | + +- 每次 RPC 仅返回 **≤ max_entries**(默认上限如 1024,可配置)。 +- 禁止:一次 RPC 返回百万项;禁止 Large 布局下无序全表扫描。 + +#### 5.4.1 Readdir Cookie 鲁棒性(已决) + +**问题**:若 Large 布局 cookie 编码 **`LeafIter` 内部物理状态**(`Node*`、叶内下标),则两次 `readdir` 之间对该目录的 **`insert`/`del` 导致 B+ 树 split/merge** 后,cookie 可能 **失效或指错位置**(重复、漏项)。目录已为 **LARGE** 时不会发生 promotion,但 **树重平衡仍会发生**。 + +**决策**:cookie 表示 **逻辑遍历位置**,不绑定可变物理指针;对齐 HDFS「续传令牌 = 逻辑名 / 有序 key」思路。 + +##### 硬性规则 + +| 规则 | 说明 | +|------|------| +| **禁止** | cookie 中序列化 `Node*`、堆地址、`LeafIter` 内存指针 | +| **必须** | 可由 `(parent_fid, generation, resume_key)` 在当前树上 **重新定位** | +| **`generation` 不匹配** | 返回 `EINVAL` / 空 cookie 重启;客户端全量重扫该目录 | + +##### SMALL / PROMOTING + +```text +CookieSmall = { + generation: u64 + layout: SMALL | PROMOTING + bucket_id: u32 // 开放寻址桶序号(稳定枚举顺序) + slot: u32 // 桶内下一起始槽位 +} +``` + +- 仅在 **同一 `generation`、同一 Small 布局** 下有效;**promotion 完成** 后 `generation++`,旧 cookie **作废**(切换为 Large cookie 或从头)。 + +##### LARGE(推荐:逻辑 key 游标) + +**首选**(实现简单、对 split/merge 最稳): + +```text +CookieLarge = { + generation: u64 + layout: LARGE + after_hash: u64 // 上一页最后一条的 name_hash + after_name: bytes // 上一页最后一条的文件名(字典序续扫) +} +``` + +恢复算法: + +```text +readdir_resume(parent, cookie): + if cookie.generation != DirNode.generation: INVALID + key = 
Key(KFS_DENTRY, parent, cookie.after_hash) + it = lowerBound(tree, key) // 现 kfstree + skip entries where (hash,name) <= cookie.after_name lexicographically + return next max_entries from it (LeafIter 仅作实现手段,不写入 cookie) +``` + +- B+ 树 **split/merge 不改变 key 的全序**;只要条目未被删除,续扫位置仍正确。 +- **并发 insert**:新名可能插在已扫过区间之前 → 客户端可能漏扫;与 HDFS 一致,**不保证** 遍历期间快照隔离;强一致列举需 **`generation` 冻结** 或 copy-on-read(**非首版**)。 +- **并发 delete**:已返回的名字可能已不存在;续扫 `lowerBound` 自然跳过。 + +**可选**(与 HDFS 部分实现类似,需稳定叶 id): + +```text +CookieLargeAlt = { generation, leaf_node_id, index_in_leaf } +``` + +- `leaf_node_id` 为 **分配的稳定叶标识**(split 时子叶继承/拆分规则须在 RFC 实现细则中定义),**不是**运行时指针。 +- 恢复时若 `leaf_node_id` 已合并/分裂:**从该 id 映射节点的最小 key**,或 **`lowerBound(该 key)` 的下一个有效叶** 继续,**宁可少量重复不可漏**(与建议一致)。 +- 首版 **优先 `after_name` 游标**;`leaf_node_id` 方案可在性能优化阶段引入。 + +##### 与 promotion / mutation 的交互 + +| 事件 | cookie 行为 | +|------|-------------| +| **promotion 完成** | `generation++`;Small cookie **失效**;客户端用空 cookie 对 Large 重扫 | +| **rename/unlink/rmdir(目录)** | `generation++`;所有 cookie 失效 | +| **Large 上 create/delete** | `generation` 可不变;**`after_name` cookie 仍有效**(靠 key 重定位);若产品要求列举快照视图,另议 | +| **返回 `-EBADF`/`EINVAL`** | 客户端 **丢弃 cookie,从空重新开始** | + +##### RPC 响应 + +- 每页返回:`entries[]`、`more_entries`、`next_cookie`(编码上述结构,版本号 `cookie_ver=1`)。 +- 不把 `LeafIter` 状态暴露给客户端。 + +**验收(P2)**:在 Large 目录连续 `readdir` 分页过程中注入随机 `insert`/`del`,验证无指针 cookie 时 **无崩溃、无无限循环**;允许与 HDFS 相同的「并发修改下不保证严格快照列举」语义。 + +--- + +## 6. 持久化设计 + +### 6.1 Edit Log 记录格式(概念) + +采用 **定长头 + 变长 payload** 的二进制编码(避免 QFS 部分文本 token 解析开销): + +```text +Record { + magic, version + txn_id: u64 // 单调 + op: u16 // CREATE=1, REMOVE=2, MKDIR=3, ... 
+ op_id: u128 // 幂等键 (client_id, seq) + payload: op-specific +} +checksum per block / per record +``` + +**CREATE payload 示例字段**:`parent_fid, name, new_fid, mode, uid, gid, replication, ...` + +### 6.2 组提交(Group Commit) + +| 模式 | 行为 | +|------|------| +| `batch`(默认) | 每 `commit_interval_ms` 或 `commit_batch_bytes` 一次 `fdatasync` | +| `always` | 每个 txn 等待 fsync(兼容强一致测试) | +| `none` | 仅写 page cache,崩溃可能丢最近操作(需明确禁用场景) | + +**Log 线程模型**:单写者 append + fsync;namespace 分片锁与 log 锁分离,缩短临界区。 + +### 6.3 用户快照(已决:HDFS 式引用 + 文件级 COW) + +**放置说明**:本节描述 **Snapshottable 目录上的用户可见快照**(类比 HDFS `createSnapshot`),与 §6.4 **周期性 Checkpoint/FSImage**(NN 冷备)分工不同。实现可落在 **P3/P3.1**(§9)。 + +**决策:创建快照采用 HDFS 核心思路——引用(Rename/Reference)而非复制;修改采用文件级写时复制(COW)。** 不采用对整棵树做全量内存扫描来「创建」用户快照(该做法保留给 §6.4 Checkpoint)。 + +#### 6.3.1 核心机制(对齐 HDFS) + +| HDFS 概念 | 本 RFC 映射 | +|-----------|-------------| +| `INodeReference` | **`InodeRef`**:快照目录上的一个轻量引用,指向某 **`fid`(目录或文件根)** 在 `committed_txn_id = N` 时的逻辑视图 | +| 创建快照 O(1) | 在 snapshottable 目录 `D` 上新增 `Snapshot{s_id}` → 仅增加 **Ref → D 的 inode/目录状态**,**不**复制百万子项 | +| 读快照 | 沿 Ref 解析路径;**无锁读**(读快照侧为只读视图) | +| 首次修改被快照覆盖的文件 | **文件级 COW**:保留旧 `Inode`+`BlockMap` 给快照;活动命名空间新建 `Inode`(新 fid 或新 inode 行)并更新 **该名字** 在 `DirTable` 中的映射 | +| 修改目录下其他未触碰文件 | **零开销**(Ref 仍指向原 inode;活动 DirTable 不变) | + +```text +allowSnapshot(dir_fid) // 标记目录可打快照(类比 snapshottable) +createSnapshot(dir, name) // 例如 /foo → s1 + → SnapshotRecord { id, parent_snap, root_ref → inode@N } // O(1),无子树复制 + +// 用户 delete / truncate / 覆盖写 / rename 活动树中的 file1,且 file1 在 s1 覆盖下: +mutate(file1): + if inode_snapshotted(file1): + cow_inode(file1): + frozen = clone_inode_shallow(file1) // 快照保留 + live = new_inode_for_mutation() // 活动命名空间 + DirTable[parent].replace_name(file1 → live) + append EditLog(COW_SPLIT, ...) 
+ else: + normal_mutate(file1) +``` + +- **目录级百万文件**:创建 `s1` **不遍历** `DirTable`;仅在被修改的单个文件上支付 COW(约一次 create + 后续 write 的元数据开销)。 +- **Large 目录**:COW 只 **`replace_name` 一条 DirEntry**(Small 或 `kfstree` 单键更新),不重扫整棵 per-dir `Tree`。 + +#### 6.3.2 性能预期(与 HDFS 对照) + +| 场景 | 性能 | 原因 | +|------|------|------| +| 读活动/读快照文件 | 快照读无额外锁;活动读与无快照相同 | Ref 只读解析 | +| 创建 / 删除快照 | **近似 O(1)** | 仅增删 `SnapshotRecord` / `InodeRef` | +| 首次修改快照覆盖下的文件 | 有开销(COW 一个 inode) | 与被修改文件数成正比,与目录总规模无关 | +| 再次修改已 COW 过的活动文件 | 与无快照相同 | 已操作活动侧新 inode | + +#### 6.3.3 数据结构(与 §4.1 衔接) + +```text +SnapshotRecord { + snap_id, name, root_dir_fid, txn_id_at_create: N + root_ref: InodeRef // O(1) 创建:指向 snapshottable 根目录 inode + cow_inodes: set // 可选:本快照触发的 frozen fid 登记,便于 delete 时递减 +} + +InodeRef { target_fid, txn_id_cap } +``` + +- **`snap_ref_count`** 定义在 §4.1 `Inode` 上:表示有多少 **独立快照引用** 仍依赖该 **inode 对象**(通常为 COW 后的 **frozen** 副本;活动/live inode 在分裂后一般为 0)。 +- Edit log:`SNAPSHOT_CREATE`、`SNAPSHOT_DELETE`、`INODE_COW_SPLIT`(含 `frozen_fid`、`live_fid`、`snap_ref_delta`),供 standby **确定性 replay**。 + +#### 6.3.6 Frozen inode 引用计数(已决) + +**问题**:§6.3 删除快照时「仅回收本快照专属的 frozen inode」。若同一 frozen inode 被 **多个快照** 引用(例如 `/foo` 上连续创建 `s1`、`s2` 后才首次修改 `file1`),**不能在 `snap_ref_count > 0` 时释放**。 + +**决策**:在 `Inode` 上维护 **`snap_ref_count`**;在 **COW 分裂** 与 **删除快照** 时严格增减;减到 **0** 才可回收该 frozen inode(及对应 `BlockMap`)。 + +##### 何时增减(与 HDFS 文件级 COW 对齐) + +| 事件 | `snap_ref_count` | 说明 | +|------|------------------|------| +| **`createSnapshot`** | 根目录 `root_ref.target` **+1**(可选) | 创建本身 O(1);**不**遍历子树给每个文件 +1。未 COW 的文件仍与 live 共用同一 `fid`,读快照走解析路径。 | +| **首次 `COW_SPLIT`(file1)** | 对 **frozen_fid**(旧 inode 副本)设为 **覆盖该文件的所有活跃快照数** `K` | 例:存在 `s1`、`s2` 均可见 `file1` 时尚未修改 → `frozen.snap_ref_count = 2`。活动侧新 `live_fid`:`snap_ref_count = 0`。 | +| **再建快照 `s3`(已有 frozen file1)** | 若 `s3` 仍指向含 `file1` 的视图且 `file1` 已 frozen:对 `frozen_fid` **+1** | 仅影响 **已分裂** 的 frozen 对象;仍与 live 共用的路径在首次 COW 时一次性结算。 | +| **`deleteSnapshot(s)`** | 对该快照登记过的每个 
`frozen_fid`:**-1** | 来自 `cow_inodes` 或快照元数据索引;**仅当减到 0** 时 `free_inode(frozen_fid)` + 释放 BlockMap | +| **活动路径修改 live inode** | 不增减 | live 与快照引用解耦 | + +```text +cow_split(file_fid, parent, name): + frozen_fid = retain_or_clone_inode(file_fid) // 旧版本留给快照 + live_fid = allocate_new_inode(...) + frozen.snap_ref_count = count_snapshots_covering(parent, name, frozen_fid) + DirTable[parent].replace_name(name, live_fid) + for each snap covering this path: + snap.cow_inodes.insert(frozen_fid) + append EditLog(INODE_COW_SPLIT, frozen_fid, live_fid, snap_ref_count, ...) + +deleteSnapshot(snap_id): + for fid in snap.cow_inodes: + if (--InodeTable[fid].snap_ref_count == 0) + free_inode_and_blockmap(fid) + remove SnapshotRecord + append EditLog(SNAPSHOT_DELETE, snap_id, ...) +``` + +##### 回收规则(「仅属于该快照」的精确定义) + +- **可回收**:`snap_ref_count` 在 `deleteSnapshot` 后变为 **0** 的 inode(表示 **没有任何** 快照再引用该 frozen 版本)。 +- **不可回收**:`snap_ref_count > 0`——即使本次删除的 `s_i` 不再引用,只要还有 `s_j` 引用同一 frozen 副本,就必须保留。 +- **活动 inode**:`snap_ref_count == 0` 为常态;删除快照 **永不** 直接 `free` 当前 live `fid`(除非该 `fid` 本身也是某次 COW 的 frozen 且计数归零)。 + +##### 正确性验证(实现必须覆盖) + +| 检查点 | 要求 | +|--------|------| +| **无双重释放** | 仅当 `snap_ref_count == 0` 入 free 队列;delete/replay 幂等 | +| **无泄漏** | 删除最后一个持有引用快照后,frozen 必入 free;fsck 扫描 `snap_ref_count==0` 且 unreachable | +| **Replay** | `INODE_COW_SPLIT` / `SNAPSHOT_DELETE` 重放后计数与主路径一致 | +| **并发** | COW 与 `deleteSnapshot` 在同 `fid` 或 snap 元数据锁下串行化计数更新 | +| **循环引用** | 命名空间为 **DAG**(父指针单父目录);`InodeRef` 仅 **快照元数据 → inode**,inode **不** 指回 `SnapshotRecord`,图 **无环**。无需通用循环引用检测,但需在 code review / 单元测试中 **断言** 不建立 inode→snapshot 反向边 | + +##### fsck(§8.5 扩展) + +- 对每个 `snap_ref_count > 0` 的 inode:存在至少一条 `SnapshotRecord` / `cow_inodes` 反向引用。 +- 对每个 `SnapshotRecord.cow_inodes` 中的 `fid`:`snap_ref_count >= 1`。 +- 删除快照后的 spot check:`cow_inodes` 中不应出现已 free 的 `fid`。 + +**成熟度说明**:引用计数为业界成熟手段(HDFS snapshot diff、 btrfs 等同类问题),但本实现须在 **COW 分裂计数初值**、**多快照叠加**、**delete + replay** 三条路径上做 **专项测试**(属性测试或模拟并发删除),列入 
**P3.1 验收**。 + +#### 6.3.4 与 §6.4 Checkpoint 的边界 + +| | §6.3 用户快照 | §6.4 Checkpoint/FSImage | +|--|----------------|-------------------------| +| 目的 | 时间点恢复、误删回滚、对比历史 | MetaServer **重启/冷备**、缩短 replay | +| 创建成本 | **O(1)** per snap | O(namespace) 后台扫描(可模糊) | +| 读路径 | 快照视图 | 正常命名空间 | +| 存储 | 内存 Ref + 被 COW 分离的 inode | 磁盘 FSImage 文件 | + +两者可同时存在:HDFS 亦区分 **Snapshot** 与 **Checkpoint(FSImage)**。 + +#### 6.3.5 未采纳为用户快照的方案 + +| 方案 | 结论 | +|------|------| +| 一致性点 + 全量遍历生成用户快照 | **否**;移至 §6.4,仅用于 Checkpoint | +| 全局 freeze 瞬时快照 | 阻塞写,不利于 CREATE 目标 | +| 目录级深拷贝百万子项 | O(N) 创建,不可接受 | + +### 6.4 Checkpoint / FSImage(已决:一致性点 + 后台遍历) + +**用途**:周期性 **MetaServer 冷备与启动加速**(类比 HDFS FSImage + edits),**不是** §6.3 的用户快照。 + +**决策**:采用 **一致性点 `N = committed_txn_id` + 后台遍历** 写出模糊 FSImage;恢复时 `load FSImage(N)` + `replay(txn_id > N)`。与 §6.3 HDFS 式快照 **正交**。 + +#### 6.4.1 流程 + +```text +triggerCheckpoint() // 周期或 MetaCheckpoint RPC + ├─ 记录 LAST_TXN_ID = committed_txn_id (N) + ├─ 后台线程遍历 InodeTable、DirTable(§4)、BlockMap(可选) + │ 允许与写并发;图像可「模糊」 + ├─ 写出 FSImage + footer(N) + └─ 原子 publish + +冷启动:load FSImage(N) → replay Edit Log (txn_id > N) → 一致 +``` + +正确性:依赖 §6.6 的写入顺序(分片锁内改内存 → append log buffer)与 §6.7 的 committed 边界;遍历期间混入图像的 `txn_id > N` 变更(模糊项)由 replay 修正,因此 replay 对已体现在 FSImage 中的 op 必须幂等(论证沿用 v0.9 的原 §6.3.2,该节现已改为用户快照性能预期)。 + +#### 6.4.2 FSImage 内容与 Large 目录 + +- `section_inodes`、`section_dirs`(Small 桶或嵌入 **`kfstree` checkpoint 流**)。 +- log 截断:**可选**运维操作,非恢复前提。 + +#### 6.4.3 代价 + +Checkpoint 扫描慢 → `txn_id > N` 的 log 段变长 → **重启 replay 变长**;需控制 checkpoint 周期(配置 `meta.checkpoint.interval` 等)。 + +### 6.5 与 QFS LogWriter / VR 的关系 + +- **可复用**:quorum 复制、block 切分、primary lease、`MetaVrLogSeq` 序语义。 +- **需替换**:`WriteLog` 序列化内容与 replay 解析器(`Replay.cc` / `replay_create` 文本格式 → 二进制 op)。 +- **不再依赖**:`metatree.insert` 作为 redo 单元;redo 单元为 **edit op**。 + +### 6.6 内存修改与 log 的顺序(相对 QFS 的关键改进) + +**提议默认顺序**: + +```text +(分片锁内)改内存 → append 到 log 内存 buffer → 释放锁 +(log 线程)buffer → 复制 → fsync → 推进 committed_txn_id +``` + +对比 QFS:**先 log committed 再 `handle()`**,客户端等待包含「空窗期」内无法从内存读到结果的双重延迟。本 RFC 的可见性边界见 **§6.7**:其他客户端以 **已提交命名空间**
为准;发起方在 RPC 成功后的可见范围与 **lease / sync 策略** 对齐 HDFS 习惯,而非「未提交 txn 全网可见」。 + +### 6.7 读一致性(已决) + +**决策:采用 (c) 跟随 HDFS 风格的 lease + 已提交命名空间模型**,并与 QFS 现有 **primary / VR / chunk lease** 语义衔接(`LEASE_ACQUIRE`、`LEASE_RENEW` 等,见 `MetaRequest`)。 + +| 场景 | 规则 | +|------|------| +| **命名空间变更**(CREATE / REMOVE / RENAME …) | 对其他客户端:仅在 edit **已 committed**(`committed_txn_id` 推进、quorum 复制完成)后可见;primary 内存中未 fsync 的 buffer **不**对外暴露。 | +| **RPC 返回与 durable** | `sync=always`:成功返回 ≡ 命名空间变更已 durable,他客户端可见(在 primary 正常服务前提下)。`sync=batch`:返回表示 **已接受**;他客户端可见时点不早于本批 **组提交 fsync**(类比 HDFS edit 组提交窗口)。 | +| **文件数据读写** | 命名空间登记(create 得 fid)与 **写数据** 分离;已打开文件的读写一致性由 **chunk lease** 保证写者独占/租约续期,读者看到已提交块版本,与 HDFS 「NN 管名字、DN 管块 + lease」分工一致。 | +| **Primary / standby** | 仅 primary 执行 namespace 变更并写 edit;standby 通过 log replay 追赶;客户端 mutating 与强一致命名空间读面向 primary(与现 VR 一致)。 | + +**不采用**: + +- **(a) 仅 primary 本地可见未提交变更**:不足以定义多客户端语义,且与 backup 复制模型冲突。 +- **(b) 未提交 txn 全网可见**:破坏恢复与 fsck 假设,并引入跨客户端脏读。 + +**实现提示**:可在 `Inode` 或目录上保留 `last_committed_txn`;`lookup` / `readdir` 仅暴露 `txn_id ≤ committed_txn_id` 的视图;写路径 lease 逻辑复用现有 QFS 实现,本层不新增第二套租约协议。 + +--- + +## 7. 并发模型 + +### 7.1 锁层次 + +| 资源 | 锁粒度 | +|------|--------| +| `DirIndex` | `hash(parent_fid) % N` 分片锁:**读锁**(lookup/readdir,`PROMOTING` 仍读 Small);**写锁**(create/promotion,写者等待 `PROMOTING` 结束) | +| `InodeTable` | `hash(fid) % M` 分片;读多写少用 RW lock | +| `FidAllocator` | 无锁原子或独立 mutex | +| `EditLog buffer` | 单写者 + MPSC 队列 | +| `PathCache`(仅客户端) | 服务端不维护该结构(见 §4.3);客户端本地缓存可用 RCU 或 per-shard 锁 | + +**禁止**:所有 mutating RPC 共用一个 `submit_request` 全局 mutex(现状瓶颈)。 + +### 7.2 与 B+ 树分片锁的区别 + +对 **全局 `metatree`(单例 B+ 树)**,「按 parent 加锁」**不安全**(不同目录可能 split 同一内部节点,见 `MetaTree-Lock-Optimization.md`)。 +对 **DirTable 分片**:按 `parent_fid` 加锁 **安全**——Small 为独立 `flat_hash`;Large 为 **该目录专属 `Tree` 实例**(仍用 `kfstree`,但不与别目录共享内部节点)。 + +--- + +## 8. RPC 与客户端约定 + +### 8.1 保持兼容的字段 + +- `CREATE`:`P`(parent fid)、`N`(name)、`R`(replicas)等现有 QFS 头。 +- 响应:`H`(新 fid)不变。 + +### 8.2 推荐客户端行为 + +1.
**路径与父 fid 缓存(必选)**:在客户端维护 `path → { fid, generation }` 与 `parent_fid`;mutating 成功后更新或按 §4.3 失效;**不要依赖** MetaServer 路径缓存。 +2. **批量 create**:`MULTI_CREATE` 一次 RPC 多条,log 一条 batch op 或连续 append 一次 fsync。 +3. **幂等**:携带 `r`(reqId);服务端 LRU 表 `op_id → result`(TTL 秒级)。 +4. **响应字段**:`LOOKUP` / `CREATE` / `READDIR` 等返回目录 `generation`(或等价 epoch),供客户端校验本地 cache。 + +### 8.3 服务端可删减的 create 工作 + +将 **striping / tier / object-store 判定** 延后到 `SETATTR` 或 **首次 `ALLOCATE`**,使 `CREATE` 保持最小临界区(可选配置开关兼容旧语义)。 + +### 8.4 特殊路径:WORM、dumpster、虚拟 `/proc`(已决) + +对齐现 QFS(`gWormMode`、`DUMPSTERDIR`、`/proc/invalid_chunks`),在 **DirIndex + InodeTable** 模型下的规则如下。 + +#### WORM + +| 项 | 规则 | +|----|------| +| 开关 | 全局 `worm_mode`(等价 `TOGGLE_WORM` RPC),与现网一致。 | +| 拦截层 | **RPC / op 分发层** 统一校验:在 `worm_mode` 下,`REMOVE` / `RENAME` / 覆盖写等 mutating 若目标路径或文件名不满足 `IsWormMutationAllowed`,返回 `-EPERM`。 | +| DirIndex | **不**为 WORM 单独建索引类型;普通 `DirIndex` 操作不变。 | +| 持久化 | `worm_mode` 写入 edit / snapshot 元数据段(或专用 op),恢复后恢复开关状态。 | + +#### dumpster(`/dumpster`) + +| 项 | 规则 | +|----|------| +| 形态 | 根目录下 **普通目录**,启动时 `MKDIR(ROOT, "dumpster")` 得到固定 `dumpster_fid`;在 `Inode.flags` 标记 **`INODE_FLAG_DUMPSTER_ROOT`**(仅根下该目录)。 | +| 用途 | `remove(..., todumpster=true)` 语义为 **rename 到 `dumpster_fid` 下**(与现 `kfsops` 一致),不是额外隐藏表。 | +| 限制(RPC) | 禁止在 dumpster 内 **create/mkdir**;禁止任意 rename **进入或离开** dumpster(`mEnforceDumpsterRulesFlag` 等价配置);禁止删除 dumpster 目录本身。 | +| DirIndex | 与普通目录相同:`DirIndex[dumpster_fid]` 存待清理文件;后台任务对非 busy 文件再 `remove`。 | + +#### `/proc/invalid_chunks`(虚拟路径) + +| 项 | 规则 | +|----|------| +| 形态 | **不进入 `DirIndex`**;无真实 `proc` 目录项。 | +| 解析 | RPC 层(如 `CREATE` / `LOOKUP_PATH` 入口)识别前缀 `/proc/invalid_chunks/`,解析 `chunkId` 后直查 **`BlockMap` / chunk 元数据**,用于诊断日志(对齐现 `MetaCreate::start` 中 `invalChunkFlag` 分支)。 | +| 客户端 | 不应缓存该路径为普通目录;不分配长期 fid。 | + +### 8.5 fsck(已决) + +**不再遍历 B+ 树叶子**;单机全量检查按以下顺序(可 fork 后台进程,对齐现 `MetaFsck` 工具链): + +```text +Phase A — InodeTable + 对每个 fid: + - 类型合法;parent_fid 存在(或为 ROOT) + - 若 
type=file:BlockMap 条目可选校验(chunk 副本、版本,委托 LayoutManager 逻辑) + - 标记 abandoned / 零长度策略(沿用现 fsck 配置项) + +Phase B — DirTable(每个目录 fid) + 按 DirNode.layout 枚举: + - **SMALL**:flat_hash 全桶扫描,校验无重复 NameKey、探测链有界 + - **LARGE**:遍历该目录专属 `Tree` 叶(同现 `kfstree` 迭代),校验 `Key(KFS_DENTRY, parent, hash)` 与 name 唯一 + 对每条 DirEntry (name → child_fid): + - InodeTable[child_fid] 存在且 parent_fid == 当前目录 fid + 对 InodeTable 中 type=dir 的项: + - 必须存在 DirTable[dir_fid];`child_count` 与枚举数量一致 + - `state` 不得持久化为 **PROMOTING**;若 checkpoint 遇到则按 §4.2.6 修复或拒绝加载 + - 若 `child_count >= dir_large_threshold` 则 `state` 应为 **LARGE** + +Phase C — 双向一致 + - 无「仅在 DirIndex 出现、Inode 无 parent」的孤儿 + - 无「Inode 有 parent 但父目录 DirIndex 无对应 name」的悬空项 + - dumpster 子项:仅允许 file 类型条目(可选策略检查) + +Phase D — 与 edit committed 视图一致(可选在线 fsck) + 仅扫描 txn_id ≤ committed_txn_id 的视图(§6.7) + +Phase E — 用户快照引用计数(§6.3.6) + - 对每个 SnapshotRecord:cow_inodes 中 fid 存在且 snap_ref_count >= 1 + - 对每个 snap_ref_count > 0 的 inode:至少被一个 SnapshotRecord.cow_inodes 引用 + - 无 snap_ref_count == 0 且仅被快照元数据悬挂的 unreachable frozen +``` + +报告格式可继续兼容现 `MetaFsck` / `kfsfsck` 客户端字段;内部扫描源从 `metatree` 迭代改为 **InodeTable + DirIndex 枚举**。 + +--- + +## 9. 分阶段实施路线图 + +| 阶段 | 交付 | CREATE 预期收益 | +|------|------|-----------------| +| **P0** | 文档 + 基准:分解 QFS create = queue / fsync / btree / mutex | 基线数据 | +| **P1** | Log 组提交 + 缩小 `submit_request` 锁;客户端强制 parent fid | 中(不改索引) | +| **P2** | `DirTable`(§4.2 Small+Large+promotion)+ `InodeTable`;百万级单目录基准 | 高 | +| **P2.1** | §8.4 特殊路径 + §8.5 fsck(含两种 DirNode layout) | 可运维 | +| **P3** | v2 edit + §6.4 Checkpoint(FSImage N + replay)+ 冷启动闭环 | 很高 | +| **P3.1** | §6.3 用户快照 + §6.3.6 `snap_ref_count`(COW/delete/replay/fsck 测试) | 可回滚目录 | + +(**范围外**:多 MetaServer namespace 分片、BlockMap 独立服务、inode 换出等,不列入本 RFC 路线图。) + +--- + +## 10. 
开放问题 + +本 RFC 范围内 **无剩余开放项**。已决事项索引: + +| 主题 | 章节 | +|------|------| +| 读一致性 | §6.7 | +| PathCache | §4.3 | +| 单机内存范围 | §2.1 / §2.2 | +| WORM / dumpster / `/proc/invalid_chunks` | §8.4 | +| fsck | §8.5 | +| 大目录索引(Small/Large + promotion) | §4.2 | +| Promotion 原子性与可见性 | §4.2.6 | +| Readdir cookie 逻辑位置 | §5.4.1 | +| 用户快照(HDFS 式 Ref + 文件级 COW) | §6.3 | +| Frozen inode `snap_ref_count` | §6.3.6 | +| Checkpoint/FSImage(一致性点 + 后台遍历) | §6.4 | + +后续若扩展 **多机分片、inode 换出**,另起 RFC。 + +--- + +## 11. 备选方案(已否决或延后) + +| 方案 | 结论 | +|------|------| +| 保留全局 B+ 树,仅优化锁 | 无法消除双 insert 与树分裂;并发上限低(见 `MetaTree-Lock-Optimization.md`) | +| 仅全局 B+ 树 | 已否决;见 §4.2.5 | +| 单目录百万项仍用平铺 HashMap+链表 | **已否决**;首版必须 Large 布局 + promotion | +| 每目录一棵 B+ 树(Large 布局) | **已采纳**,**复用 `kfstree`**,仅用于 `child_count ≥ threshold` 的目录 | +| 自研另一套目录 B-tree 实现 | **已否决**,与现网重复且难保持 checkpoint 一致 | +| 纯 tmpfs、无持久化 | 不符合 QFS 定位 | +| 完全照搬 RocksDB/LSM 存 namespace | 写放大与 create 延迟不如 hash + edit log 直接 | + +--- + +## 12. 成功指标(建议验收) + +在相同硬件与 `sync=batch`(如 1ms 组提交)下,相对当前 QFS main: + +| 指标 | 目标(示例,需基准标定) | +|------|--------------------------| +| 空文件 create p50 | 降低 ≥ 50% | +| 空文件 create p99 | 降低 ≥ 40%(含 fsync 尾延迟) | +| create QPS(单 meta,多客户端线程) | 提升 ≥ 3×,且随线程数近线性至磁盘/log 瓶颈 | +| 单目录 10⁶ 子项 lookup p99 | < 50µs 量级(Large 布局,无长链;以基准为准) | +| 单目录 10⁶ 子项 readdir(每页 1k) | 稳定延迟,不随 N 线性恶化 | +| 恢复时间 | `FSImage(N)` + `replay(txn>N)`(§6.4);与用户快照创建 O(1)(§6.3)无关 | +| 创建用户快照 | O(1),与目录子项数无关(§6.3) | + +--- + +## 13.
参考文献(仓库内) + +- `src/cc/meta/kfstree.h` / `kfstree.cc` — B+ 树(Large 目录 **复用** 本实现;全局 `metatree` 不再用于 namespace dentry) +- `src/cc/meta/kfsops.cc` — `Tree::create` / `link` 双 `insert` +- `src/cc/meta/MetaRequest.cc` — `MetaCreate::start` / `handle`,`SubmitBegin` +- `src/cc/meta/LogWriter.cc` — `Enqueue`、`WriteLog`、`fsync` +- `src/cc/meta/NetDispatch.cc` — `submit_request` 全局串行注释 +- `src/cc/meta/MetaRequest.cc` — `gWormMode`、`/proc/invalid_chunks`、`MetaFsck` +- `src/cc/meta/kfsops.cc` — `DUMPSTERDIR`、dumpster rename/remove 规则 +- `MetaTree-Lock-Optimization.md` — B+ 树分片锁不安全分析 +- `wiki/Performance-Comparison-to-HDFS.md` — 历史 metaserver 对比背景 + +--- + +## 修订历史 + +| 版本 | 日期 | 说明 | +|------|------|------| +| 0.1 | 2026-05-25 | 初稿:绿场内存原生元数据层,对照 QFS/HDFS | +| 0.2 | 2026-05-25 | 移除 § 迁移与兼容;明确绿场/冷启动范围,路线图去掉迁移工具 | +| 0.3 | 2026-05-25 | 读一致性决策:§6.6 采用 HDFS 风格 lease + 已提交命名空间 | +| 0.4 | 2026-05-25 | PathCache 决策:§4.3 仅客户端缓存,MetaServer 不维护/复制路径 cache | +| 0.5 | 2026-05-25 | 范围限定单机内存;去掉 namespace 水平分片/换出开放项与 P4 路线图 | +| 0.6 | 2026-05-25 | §8.4 特殊路径、§8.5 fsck 已决;§10 无剩余开放项 | +| 0.7 | 2026-05-25 | §4.2 大目录首版必做:Small flat_hash + Large 每目录 B+ 树 + promotion | +| 0.8 | 2026-05-25 | Large 布局明确复用现 `kfstree`(`Tree`/`Node`/`Key`/`MetaDentry`),不新写 B-tree | +| 0.9 | 2026-05-25 | §6.3 已决:一致性点 + 后台模糊 FSImage + replay(txn>N) | +| 1.0 | 2026-05-25 | §6.3 改为 HDFS 式用户快照(InodeRef+文件级COW);§6.4 为 Checkpoint/FSImage | +| 1.1 | 2026-05-25 | §4.2.6 Promotion:`PROMOTING` 状态、staging、读 Small/写等待、原子发布 | +| 1.2 | 2026-05-25 | §6.3.6:`snap_ref_count`、COW/删快照维护、fsck 与无环不变量 | +| 1.3 | 2026-05-25 | §5.4.1:readdir cookie 用逻辑 key 游标,禁止 LeafIter/节点指针 | From 469b735db745c9758e157950ae84fd426a20bb1b Mon Sep 17 00:00:00 2001 From: zhangzhibiao Date: Mon, 25 May 2026 18:08:50 +0800 Subject: [PATCH 5/7] modify --- .../RFC-0001-memory-native-metadata-layer.md | 68 +++++++++++++++---- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/docs/rfc/RFC-0001-memory-native-metadata-layer.md 
b/docs/rfc/RFC-0001-memory-native-metadata-layer.md index 6074bfe2c..2d86a134f 100644 --- a/docs/rfc/RFC-0001-memory-native-metadata-layer.md +++ b/docs/rfc/RFC-0001-memory-native-metadata-layer.md @@ -278,6 +278,16 @@ promote_small_to_large(parent_fid): **不采用**:晋升过程中对活动 RPC 暴露「部分迁移」的 Large;不采用无 `PROMOTING` 标记、原地边建树边切换 `layout` 字段。 +##### 晋升期间的读性能与写者饥饿(已决) + +- **读者**:`PROMOTING` 期间仍持目录 **读锁** 访问 `body.small`,可与其它 `lookup`/`readdir` **并发**;晋升线程 **只读遍历** Small,不修改 Small。 +- **风险**:触发晋升时 Small 可能已接近阈值(如 **数千~4096** 项),步骤 4 的 `insert` 循环耗时可 **阻塞同目录所有写者**(`create`/`unlink` 等等待 `promote_cv`),极端情况下造成 **写者饥饿**。 +- **决策**:单次 `promote_small_to_large` 须有 **墙上时钟上限**(默认 **`meta.dir.promoteMaxWallMs = 1000`**,可配置): + - 在循环中 **分批** `insert`(如每批 256/512 项)并检查超时; + - **未超时**:正常完成步骤 5–7; + - **超时**:中止本轮晋升 → **回滚 SMALL**(§失败与恢复),返回 `-EBUSY` / 可重试错误;**不**半发布 Large;客户端/写路径 **退避重试** 或稍后由下一次 `create` 再次触发。 +- **观测**:对 `promote_wall_ms`、`promote_aborted_timeout` 打点;P2 验收:4096 项目录晋升 p99 墙钟 **≤ 配置上限**。 + ##### 失败与恢复 - 若步骤 4 失败:`state` 回滚 **SMALL**,丢弃 `staging`,`generation` 不变,唤醒等待者并返回错误。 @@ -418,19 +428,21 @@ CookieSmall = { CookieLarge = { generation: u64 layout: LARGE - after_hash: u64 // 上一页最后一条的 name_hash - after_name: bytes // 上一页最后一条的文件名(字典序续扫) + last_key: NameKey // 上一页最后一条的完整排序键 (name_hash, name) } +// 字段名 after_hash/after_name 仅作实现别名,语义上必须是 NameKey 二元组 ``` +- **排序键**:与 §4.2.1 `NameKey` 一致;`kfstree` 叶序为 **先 `name_hash` 再 `name` 字典序**(同 `MetaDentry::matchSelf`)。单目录内 **不可能** 存在两个相同 `name`,但续扫仍须用 **`(hash, name)` 对**,不能仅用 `name`(不同 hash 桶下仅比 name 会错位)。 +- **禁止**:cookie 仅编码 `name` 字符串而省略 `name_hash`。 + 恢复算法: ```text readdir_resume(parent, cookie): if cookie.generation != DirNode.generation: INVALID - key = Key(KFS_DENTRY, parent, cookie.after_hash) - it = lowerBound(tree, key) // 现 kfstree - skip entries where (hash,name) <= cookie.after_name lexicographically + it = lowerBound(tree, Key(KFS_DENTRY, parent, cookie.last_key.name_hash)) + skip entries where NameKey(hash,name) <= 
cookie.last_key lexicographically return next max_entries from it (LeafIter 仅作实现手段,不写入 cookie) ``` @@ -446,7 +458,7 @@ CookieLargeAlt = { generation, leaf_node_id, index_in_leaf } - `leaf_node_id` 为 **分配的稳定叶标识**(split 时子叶继承/拆分规则须在 RFC 实现细则中定义),**不是**运行时指针。 - 恢复时若 `leaf_node_id` 已合并/分裂:**从该 id 映射节点的最小 key**,或 **`lowerBound(该 key)` 的下一个有效叶** 继续,**宁可少量重复不可漏**(与建议一致)。 -- 首版 **优先 `after_name` 游标**;`leaf_node_id` 方案可在性能优化阶段引入。 +- 首版 **优先 `last_key`(NameKey)游标**;`leaf_node_id` 方案可在性能优化阶段引入。 ##### 与 promotion / mutation 的交互 @@ -454,7 +466,7 @@ CookieLargeAlt = { generation, leaf_node_id, index_in_leaf } |------|-------------| | **promotion 完成** | `generation++`;Small cookie **失效**;客户端用空 cookie 对 Large 重扫 | | **rename/unlink/rmdir(目录)** | `generation++`;所有 cookie 失效 | -| **Large 上 create/delete** | `generation` 可不变;**`after_name` cookie 仍有效**(靠 key 重定位);若产品要求列举快照视图,另议 | +| **Large 上 create/delete** | `generation` 可不变;**`last_key` cookie 仍有效**(靠 `NameKey` 重定位);若产品要求列举快照视图,另议 | | **返回 `-EBADF`/`EINVAL`** | 客户端 **丢弃 cookie,从空重新开始** | ##### RPC 响应 @@ -575,9 +587,9 @@ InodeRef { target_fid, txn_id_cap } cow_split(file_fid, parent, name): frozen_fid = retain_or_clone_inode(file_fid) // 旧版本留给快照 live_fid = allocate_new_inode(...) - frozen.snap_ref_count = count_snapshots_covering(parent, name, frozen_fid) + frozen.snap_ref_count = snapshot_ref_index.count(frozen_fid) // 见下,禁止仅靠运行时全表扫描 DirTable[parent].replace_name(name, live_fid) - for each snap covering this path: + for each snap in snapshot_ref_index.ref_snapshots(frozen_fid): snap.cow_inodes.insert(frozen_fid) append EditLog(INODE_COW_SPLIT, frozen_fid, live_fid, snap_ref_count, ...) 
@@ -611,7 +623,22 @@ deleteSnapshot(snap_id): - 对每个 `SnapshotRecord.cow_inodes` 中的 `fid`:`snap_ref_count >= 1`。 - 删除快照后的 spot check:`cow_inodes` 中不应出现已 free 的 `fid`。 -**成熟度说明**:引用计数为业界成熟手段(HDFS snapshot diff、 btrfs 等同类问题),但本实现须在 **COW 分裂计数初值**、**多快照叠加**、**delete + replay** 三条路径上做 **专项测试**(属性测试或模拟并发删除),列入 **P3.1 验收**。 +##### `count_snapshots_covering` 与倒排索引(已决) + +**问题**:`frozen.snap_ref_count = count_snapshots_covering(...)` 若在 COW 时 **扫描全部 SnapshotRecord** 或沿路径动态枚举,易错且 O(快照数);多快照引用同一 frozen inode 时 **跨快照累计** 必须精确。 + +**决策**(二选一,首版至少实现其一): + +| 方案 | 做法 | +|------|------| +| **A. 倒排索引(推荐)** | 维护 `SnapshotRefIndex: frozen_fid → { snap_id... }`(及可选 `(parent,name) → frozen_fid`)。`createSnapshot`:对仍与 live 共用的路径 **不** 预遍历;**COW 时** 将 `frozen_fid` 登记到 **当前所有覆盖该 `(parent,name)` 的活跃快照**(由 snap 链/目录 Ref 解析一次,写入索引)。`deleteSnapshot`:对 `cow_inodes` 中每个 `fid` 从索引移除 `snap_id`,再 `--snap_ref_count`。 | +| **B. 快照创建时预计算** | 在 `createSnapshot` O(1) 元数据之外,记录「该快照可见的 (parent,name)→fid 视图版本」;首次 COW 时用 **快照差分元数据** 得到 `K`,写入 `snap_ref_count` 与 `cow_inodes`。 | + +- **禁止**:`deleteSnapshot` 或 replay 时依赖 **未持久化的** 临时扫描结果且与主路径不一致。 +- **再建快照 `s3`(file1 已 frozen)**:`SnapshotRefIndex` 对 `frozen_fid` **insert(s3)** 并 `snap_ref_count++`(与上表「再建快照」行一致)。 +- **fsck**:`snap_ref_count == |SnapshotRefIndex[fid]|`(允许索引与 `cow_inodes` 并集交叉校验)。 + +**成熟度说明**:引用计数为业界成熟手段,但须在 **COW 初值 / 多快照叠加 / delete + replay / 索引一致性** 上做 **专项测试**,列入 **P3.1 验收**。 #### 6.3.4 与 §6.4 Checkpoint 的边界 @@ -662,6 +689,20 @@ triggerCheckpoint() // 周期或 MetaCheckpoint RPC 快照扫描慢 → `txn_id > N` 的 log 段变长 → **重启 replay 变长**;需控制 checkpoint 周期(配置 `meta.checkpoint.interval` 等)。 +#### 6.4.4 扫描期内存压力(已决) + +**问题**:后台遍历 `InodeTable`、`DirTable`(含 Large 目录 `kfstree` 流式导出)、`BlockMap` 时,若 **每 inode/每目录项分配独立序列化 buffer**,峰值内存可与 **瞬时分配速率 × 对象数** 成正比,挤压热路径 RSS。 + +**决策**: + +| 措施 | 说明 | +|------|------| +| **Buffer 池** | 后台线程 **复用** 固定大小写缓冲(如 1–4 MiB),`section_*` 写满再 flush 到 FSImage 文件,避免 per-object `malloc` | +| **扫描节流** | 
`meta.checkpoint.maxEntriesPerTick` / `maxBytesPerTick` 限制每时间片处理条数;`yield` 或短 sleep,避免与 mutating 抢满 CPU | +| **Large 目录** | 按 `LeafIter` **流式** 写出 checkpoint 记录,**禁止** 先将百万 `MetaDentry` 载入单一 `vector` | +| **背压** | 若 FSImage 写盘慢于扫描,队列深度有界;超限则 **拉长 checkpoint 周期** 而非无界堆内存 | +| **可观测** | `checkpoint_scan_rss_delta`、`checkpoint_buffer_pool_bytes` 指标;压测:全量 namespace 扫描期间 CREATE p99 退化 **≤ 约定比例**(如 20%,P3 验收) | + ### 6.5 与 QFS LogWriter / VR 的关系 - **可复用**:quorum 复制、block 切分、primary lease、`MetaVrLogSeq` 序语义。 @@ -836,10 +877,12 @@ Phase E — 用户快照引用计数(§6.3.6) | fsck | §8.5 | | 大目录索引(Small/Large + promotion) | §4.2 | | Promotion 原子性与可见性 | §4.2.6 | -| Readdir cookie 逻辑位置 | §5.4.1 | +| Promotion 墙钟上限(写者饥饿) | §4.2.6 | +| Readdir cookie 逻辑位置(`NameKey`) | §5.4.1 | | 用户快照(HDFS 式 Ref + 文件级 COW) | §6.3 | -| Frozen inode `snap_ref_count` | §6.3.6 | +| Frozen inode `snap_ref_count` + 倒排索引 | §6.3.6 | | Checkpoint/FSImage(一致性点 + 后台遍历) | §6.4 | +| Checkpoint 扫描内存与节流 | §6.4.4 | 后续若扩展 **多机分片、inode 换出**,另起 RFC。 @@ -906,3 +949,4 @@ Phase E — 用户快照引用计数(§6.3.6) | 1.1 | 2026-05-25 | §4.2.6 Promotion:`PROMOTING` 状态、staging、读 Small/写等待、原子发布 | | 1.2 | 2026-05-25 | §6.3.6:`snap_ref_count`、COW/删快照维护、fsck 与无环不变量 | | 1.3 | 2026-05-25 | §5.4.1:readdir cookie 用逻辑 key 游标,禁止 LeafIter/节点指针 | +| 1.4 | 2026-05-25 | §4.2.6 晋升墙钟上限;§5.4.1 `last_key`;§6.3.6 倒排索引;§6.4.4 checkpoint 内存 | From c3b38a24d02bf8936cf3c49c524dd1cfded56761 Mon Sep 17 00:00:00 2001 From: zhangzhibiao Date: Mon, 25 May 2026 19:59:31 +0800 Subject: [PATCH 6/7] update doc --- .../RFC-0001-memory-native-metadata-layer.md | 213 ++++++++++++------ 1 file changed, 143 insertions(+), 70 deletions(-) diff --git a/docs/rfc/RFC-0001-memory-native-metadata-layer.md b/docs/rfc/RFC-0001-memory-native-metadata-layer.md index 2d86a134f..377191223 100644 --- a/docs/rfc/RFC-0001-memory-native-metadata-layer.md +++ b/docs/rfc/RFC-0001-memory-native-metadata-layer.md @@ -105,7 +105,7 @@ QFS **并非不能** 采用 HDFS NN 式布局;当前选择是 **统一 B+ 树 
┌───────────────────────────▼─────────────────────────────┐ │ 持久化层 │ │ • Edit Log(二进制 op,组提交 fsync) │ -│ • 用户快照:HDFS 式引用 + 文件级 COW(§6.3) │ +│ • 用户快照:Ref + COW + 目录 diff(§6.3) │ │ • Checkpoint/FSImage:一致性点 N + 后台遍历(§6.4) │ │ • Quorum / VR 复制(复用现有 LogWriter/VR 基础设施) │ └─────────────────────────────────────────────────────────┘ @@ -115,11 +115,11 @@ QFS **并非不能** 采用 HDFS NN 式布局;当前选择是 **统一 B+ 树 | 维度 | QFS 现状 | 本 RFC | |------|----------|--------| -| 主索引 | 全局 B+ 树,dentry/fattr 不同 key | DirNode(Small hash / Large **复用 `kfstree` 每目录一棵 `Tree`**)+ InodeTable | +| 主索引 | 全局 B+ 树,dentry/fattr 不同 key | DirNode(Small hash / Large **抽取/适配 `kfstree` 节点算法的目录局部 B+ 树**)+ InodeTable | | CREATE 索引操作 | 2× 全局 `insert` + 多次 `findLeaf` | 1× DirNode insert + 1× InodeTable insert | | 百万级单目录 | 同全局树叶子链/同桶冲突风险 | Large 布局 O(log N),首版必做 promotion | -| 持久化顺序 | 先 WAL committed,再 `handle()` | 临界区内改内存 + append log buffer;fsync 摊销 | -| 用户快照 | (现 QFS 无同等机制) | `InodeRef` + 文件级 COW,创建 O(1)(§6.3) | +| 持久化顺序 | 先 WAL committed,再 `handle()` | 临界区内写入 **pending 版本** + append log buffer;commit 后进入对外可见视图 | +| 用户快照 | (现 QFS 无同等机制) | `InodeRef` + 文件级 COW + 目录 diff,创建 O(1)(§6.3) | | checkpoint | B+ 树页/节点序列化 | 模糊 FSImage(§6.4)+ replay txn>N | | chunk 元数据 | 同树 `KFS_CHUNKINFO` | BlockMap 分离,allocate 时再写 | @@ -137,7 +137,7 @@ Inode { mode, uid, gid, size, mtime, ctime, atime nlink, flags // 见 §8.4:WORM、dumpster 子树、striping 等 snapshottable: bool // 目录可打快照(§6.3) - snap_ref_count: u32 // 被用户快照持有的 frozen 引用数(§6.3.6);live inode 常为 0 + snap_ref_count: u32 // 被用户快照持有的 frozen 引用数(§6.3.4);live inode 常为 0 replication | ec_policy // 或仅指针,详细布局在 allocate 时设置 dir_child_count // 仅目录;用于 readdir 分页提示 generation: u64 // 每次 rename/unlink/rmdir/promotion 递增,供 cache 失效 @@ -176,12 +176,12 @@ DirNode = { #### 4.2.3 Large 布局(子项数 ≥ 阈值,或 Small 无法安全插入) -**决策:Large 布局直接复用当前 QFS B+ 树实现**(`kfstree.h` / `kfstree.cc`),不新写一套目录 B-tree。与全局 `metatree` 的差异仅是 **每目录一棵独立 `Tree` 实例**,键空间 scoped 在该 `parent_fid` 下。 +**决策:Large 布局复用当前 QFS B+ 
树的节点/Key/迭代算法**(`kfstree.h` / `kfstree.cc`),但实现上需要抽取或适配为 **目录局部 B+ 树组件**,而不是把现有 `Tree` 类原封不动实例化。现有 `Tree` 仍带有全局 namespace、checkpoint、dumpster、path cache 等语义;LargeDir 只需要其中的有序 dentry 索引能力。 | 复用组件 | 路径 / 说明 | |----------|-------------| | 内部节点 | `Node`(`NKEY=170`,4096B 页式节点,`findplace` 二分,`split` / `merge`) | -| 树操作 | `Tree::insert`、`Tree::del`、`lowerBound` / `findLeaf`、`LeafIter` | +| 树操作 | 复用/抽取 `insert`、`del`、`lowerBound` / `findLeaf`、`LeafIter` 的节点算法 | | 键 | 现有 `Key` / `PartialMatch`;叶键 **`Key(KFS_DENTRY, parent_fid, name_hash)`**,与现 `MetaDentry::keySelf()` 一致 | | 叶记录 | `MetaDentry`(或薄封装 `DirBTreeLeaf` 内嵌相同字段);`matchSelf` 比对 `name` | | 内存 | `MetaNode::allocate` / `PoolAllocator`(与现 meta 节点相同) | @@ -189,17 +189,17 @@ DirNode = { ```text LargeDir { parent_fid: fid_t - tree: Tree // 现 kfstree.Tree,非全局 metatree 单例 + tree: DirBTree // 从 kfstree 节点算法抽取/适配,非全局 metatree 单例 } ``` - **语义**:逻辑上仍是「该目录下 name → child_fid」;物理上用 **一棵子树** 存该目录全部 `MetaDentry` 叶,**不再**插入全局 `metatree` 的混合 key 空间。 -- **`lookup` / `create`**:对该目录的 `Tree` 调用与现 `getDentry` / `insert` 相同逻辑(`findLeaf` + 叶链 `peer()` 扫同名 hash),**O(log N)**,无百万长链。 +- **`lookup` / `create`**:对该目录的 `DirBTree` 调用与现 `getDentry` / `insert` 相同逻辑(`findLeaf` + 叶链 `peer()` 扫同名 hash),**O(log N)**,无百万长链。 - **`readdir`**:`LeafIter` 逻辑序遍历 + §5.4 **逻辑位置 cookie**(禁止裸指针)。 - **升格(promotion)**:原子性与并发语义见 **§4.2.6**(`PROMOTING` 状态、写阻塞、读仍用 Small、staging 完成后一次性切换)。 -- **checkpoint / fsck**:Large 目录序列化可 **复用现 Node/Meta checkpoint 格式**;fsck 见 §8.5(`PROMOTING` 视为 transient,持久化快照中不应出现)。 +- **checkpoint / fsck**:Large 目录可复用现 Node/Meta 的 **记录编码思路**,但需新增 `section_dirs.large` 外层元数据(`parent_fid`、layout、generation、child_count、记录数/校验和),不能直接把全局 `metatree` checkpoint 流嵌入;fsck 见 §8.5(`PROMOTING` 视为 transient,持久化快照中不应出现)。 -**不新写**:单独的目录 B-tree 节点类型、另一套 split/merge 或不同于 `Node` 页大小的树实现。 +**不重复造轮子**:split/merge、节点页大小、Key 排序、LeafIter 语义应尽量沿用 `kfstree`;但需要把全局 `Tree` 的非目录职责剥离出去。 #### 4.2.4 复杂度与验收(百万级单目录) @@ -216,15 +216,15 @@ LargeDir { | | 全局 `metatree`(现 QFS) | 
Large `DirNode`(本 RFC) | |--|---------------------------|---------------------------| | 代码 | `kfstree` | **同一套** `kfstree` | -| 实例 | 单例 `metatree`,混放 dentry/fattr/chunk | **每超大目录一个 `Tree`** | +| 实例 | 单例 `metatree`,混放 dentry/fattr/chunk | **每超大目录一个 `DirBTree`** | | create 副作用 | 可能 split 共享祖先内部节点 | 仅影响该目录子树 | | 小目录 | 也走全局树 | **Small `flat_hash`**,不进 B+ 树 | -InodeTable、BlockMap **不再**进入任何 B+ 树;仅 **超大目录的子项列表** 使用 `Tree` 存 `MetaDentry` 叶。 +InodeTable、BlockMap **不再**进入任何 B+ 树;仅 **超大目录的子项列表** 使用 `DirBTree` 存 `MetaDentry` 叶。 #### 4.2.6 晋升(Promotion)的原子性与可见性(已决) -**问题**:§4.2.3 若在「半建成」的 Large `Tree` 上并发 `lookup`/`create`,可能看到 **不完整** 的 B+ 树或 Small/Large 双写混乱。 +**问题**:§4.2.3 若在「半建成」的 Large `DirBTree` 上并发 `lookup`/`create`,可能看到 **不完整** 的 B+ 树或 Small/Large 双写混乱。 **决策**:`DirNode` 增加 **`state`**;晋升在 **staging** 中构建 Large,通过 **一次性发布** 切换;晋升期间 **读走 Small、写阻塞或排队**。 @@ -250,7 +250,7 @@ promote_small_to_large(parent_fid): 1. assert(state == SMALL) 2. state = PROMOTING - 3. staging.large = new Tree() // 读者不可见 + 3. staging.large = new DirBTree() // 读者不可见 4. for entry in body.small: // 只读 Small,不改 Small staging.large.insert(MetaDentry(...)) 5. // 一次性发布(原子切换可见布局) @@ -351,24 +351,30 @@ BlockMap : 按 fid 分片 1. shard = hash(parent_fid) % N_SHARDS 2. lock(DirShard[shard]) 3. if DirIndex[parent].contains(name) → 处理 exclusive / truncate 语义 -4. new_fid = FidAllocator.next() -5. DirTable[parent].insert(name, new_fid) // §4.2,必要时 promotion -6. InodeTable[new_fid] = Inode{ parent, attrs... } -7. update parent.mtime, parent.file_count -8. txn = EditLog.append(CREATE, parent, name, new_fid, attrs, op_id) -9. unlock -10. if sync_policy == always: wait(txn.committed) -11. return new_fid +4. txn_id = EditLog.reserve_txn() +5. new_fid = FidAllocator.next() +6. DirTable[parent].insert_pending(name, new_fid, create_txn=txn_id) // §4.2,必要时 promotion +7. InodeTable[new_fid] = Inode{ parent, attrs..., create_txn=txn_id, delete_txn=none, pending=true } +8. update parent pending mtime / child_count version +9. 
EditLog.append_buffer(txn_id, CREATE, parent, name, new_fid, attrs, op_id) +10. unlock +11. if sync_policy == always: wait(txn_id.committed) +12. return { new_fid, txn_id } ``` **树操作次数**:0。持久化:1 条 edit(组提交时与其他 op 共享一次 fsync)。 +**可见性要求**:步骤 6–9 修改的是 **pending 版本**。普通 `LOOKUP`/`READDIR` 只暴露 `txn_id <= committed_txn_id` 的版本;同一客户端是否可读到自己的 pending create 由会话级 read-your-writes 选项单独定义,默认不向其它客户端暴露未提交 txn。 + ### 5.2 LOOKUP(单级) ```text lock(DirShard[hash(parent)]) - entry = DirTable[parent].lookup(name) + entry = DirTable[parent].lookup_committed(name, committed_txn_id) + if entry == null: return ENOENT fa = InodeTable[entry.fid] + if fa.create_txn > committed_txn_id: return ENOENT + if fa.delete_txn != none and fa.delete_txn <= committed_txn_id: return ENOENT unlock → 权限检查 ``` @@ -386,8 +392,8 @@ readdir(parent, cookie, max_entries) → 分页返回 DirEntry 列表 | `DirNode.state` | 遍历方式 | cookie 概要 | |-----------------|----------|-------------| -| **SMALL** | 桶序 + 桶内序 | 逻辑位置(§5.4.1) | -| **PROMOTING** | 仍按 Small | 同 SMALL;`generation` 未变 | +| **SMALL** | 按 `NameKey` 逻辑序(Small 有界,必要时临时排序) | 逻辑 key 游标(§5.4.1) | +| **PROMOTING** | 仍按 Small 的 `NameKey` 逻辑序 | 同 SMALL;promotion 完成后 `generation++` | | **LARGE** | B+ 树 key 序(`kfstree`) | **逻辑 key 游标**,禁止节点指针 | - 每次 RPC 仅返回 **≤ max_entries**(默认上限如 1024,可配置)。 @@ -413,12 +419,13 @@ readdir(parent, cookie, max_entries) → 分页返回 DirEntry 列表 CookieSmall = { generation: u64 layout: SMALL | PROMOTING - bucket_id: u32 // 开放寻址桶序号(稳定枚举顺序) - slot: u32 // 桶内下一起始槽位 + last_key: NameKey? 
// 上一页最后一条;空表示从头 } ``` -- 仅在 **同一 `generation`、同一 Small 布局** 下有效;**promotion 完成** 后 `generation++`,旧 cookie **作废**(切换为 Large cookie 或从头)。 +- Small 不把开放寻址的 `bucket_id`/`slot` 暴露给 cookie;rehash、删除后的 tombstone 清理、Robin Hood 位移都会改变物理桶位置。 +- `readdir` 对 Small 使用 `NameKey` 逻辑序重定位;Small 有阈值上限(默认 4096),可在每页临时收集并排序,或维护有序 side index。 +- Small 上任意 `create`/`delete`/rehash 必须 `generation++`,旧 cookie 返回 `EINVAL` 并要求客户端重扫;promotion 完成同样 `generation++`,旧 Small cookie 作废。 ##### LARGE(推荐:逻辑 key 游标) @@ -465,6 +472,7 @@ CookieLargeAlt = { generation, leaf_node_id, index_in_leaf } | 事件 | cookie 行为 | |------|-------------| | **promotion 完成** | `generation++`;Small cookie **失效**;客户端用空 cookie 对 Large 重扫 | +| **Small 上 create/delete/rehash** | `generation++`;Small cookie **失效**,避免开放寻址物理位置变化造成漏扫/重复 | | **rename/unlink/rmdir(目录)** | `generation++`;所有 cookie 失效 | | **Large 上 create/delete** | `generation` 可不变;**`last_key` cookie 仍有效**(靠 `NameKey` 重定位);若产品要求列举快照视图,另议 | | **返回 `-EBADF`/`EINVAL`** | 客户端 **丢弃 cookie,从空重新开始** | @@ -507,11 +515,11 @@ checksum per block / per record **Log 线程模型**:单写者 append + fsync;namespace 分片锁与 log 锁分离,缩短临界区。 -### 6.3 用户快照(已决:HDFS 式引用 + 文件级 COW) +### 6.3 用户快照(已决:HDFS 式引用 + 文件级 COW + 目录 Diff) **放置说明**:本节描述 **Snapshottable 目录上的用户可见快照**(类比 HDFS `createSnapshot`),与 §6.4 **周期性 Checkpoint/FSImage**(NN 冷备)分工不同。实现可落在 **P3/P3.1**(§9)。 -**决策:创建快照采用 HDFS 核心思路——引用(Rename/Reference)而非复制;修改采用文件级写时复制(COW)。** 不采用对整棵树做全量内存扫描来「创建」用户快照(该做法保留给 §6.4 Checkpoint)。 +**决策:创建快照采用 HDFS 核心思路——引用(Rename/Reference)而非复制;文件内容修改采用文件级写时复制(COW);目录项变化必须记录目录 diff。** 不采用对整棵树做全量内存扫描来「创建」用户快照(该做法保留给 §6.4 Checkpoint)。 #### 6.3.1 核心机制(对齐 HDFS) @@ -541,14 +549,38 @@ mutate(file1): ``` - **目录级百万文件**:创建 `s1` **不遍历** `DirTable`;仅在被修改的单个文件上支付 COW(约一次 create + 后续 write 的元数据开销)。 -- **Large 目录**:COW 只 **`replace_name` 一条 DirEntry**(Small 或 `kfstree` 单键更新),不重扫整棵 per-dir `Tree`。 +- **Large 目录**:COW 只 **`replace_name` 一条 DirEntry**(Small 或 `DirBTree` 单键更新),不重扫整棵 per-dir `DirBTree`。 + +#### 6.3.1.1 目录 Diff(必需) + +仅有 
`InodeRef + 文件级 COW` **不足以**提供用户快照的时间点语义:快照创建后,live 目录里的 `create`、`unlink`、`rename` 若直接修改 `DirTable`,快照读会跟着变化。首版快照必须同时实现 **目录级 diff**(对齐 HDFS snapshot diff 思路),记录快照创建点之后每个 snapshottable 子树内的目录项变化。 + +```text +DirSnapshotDiff { + dir_fid, snap_id, base_txn + created: set<NameKey> // 快照之后新建,快照视图不可见 + deleted: map<NameKey, frozen_fid> // 快照之后删除/rename out,快照视图仍可见 + renamed: optional oldName -> newName // 可展开为 deleted+created +} +``` + +规则: + +- `createSnapshot(D)`:只创建根 `SnapshotRecord`,不遍历百万子项;目录 diff 延迟到后续 mutation 时按需创建。 +- `create(parent, name)`:若 parent 被某个活跃快照覆盖,在对应 `DirSnapshotDiff.created` 记录 `name`,使该快照视图过滤掉新名字。 +- `unlink/rename out(parent, name)`:若被快照覆盖,先冻结当前 `child_fid`(文件按 §6.3.4;目录需冻结目录引用和后续 diff 链),在 `deleted[name]` 记录 frozen 引用,快照视图继续返回旧条目。 +- `rename across dirs`:按源目录 `deleted` + 目标目录 `created` 处理;必须与 §7.3 锁顺序一致。 +- `readdir(snapshot)`:以 live DirIndex 为基底叠加 diff:过滤 `created`,补回 `deleted`,并按 `NameKey` 逻辑序输出;Large 目录仍使用 `DirBTree` lowerBound,再 merge diff 项。 + +没有目录 diff 时,§6.3 的用户快照只能算 inode 引用缓存,不能作为可恢复的目录快照交付。 #### 6.3.2 性能预期(与 HDFS 对照) | 场景 | 性能 | 原因 | |------|------|------| | 读活动/读快照文件 | 快照读无额外锁;活动读与无快照相同 | Ref 只读解析 | -| 创建 / 删除快照 | **近似 O(1)** | 仅增删 `SnapshotRecord` / `InodeRef` | +| 创建快照 | **O(1)** | 仅新增 `SnapshotRecord` / `InodeRef`,不遍历子树 | +| 删除快照 | O(本快照登记的 frozen/diff 项) | 释放 `cow_inodes`、`dir_diffs.deleted` 与倒排索引引用 | | 首次修改快照覆盖下的文件 | 有开销(COW 一个 inode) | 与被修改文件数成正比,与目录总规模无关 | | 再次修改已 COW 过的活动文件 | 与无快照相同 | 已操作活动侧新 inode | @@ -558,16 +590,17 @@ mutate(file1): SnapshotRecord { snap_id, name, root_dir_fid, txn_id_at_create: N root_ref: InodeRef // O(1) 创建:指向 snapshottable 根目录 inode - cow_inodes: set<fid_t> // 可选:本快照触发的 frozen fid 登记,便于 delete 时递减 + cow_inodes: set<fid_t> // 本快照引用的 frozen fid,便于 delete 时递减 + dir_diffs: map<dir_fid, DirSnapshotDiff> // 本快照目录项变化 } InodeRef { target_fid, txn_id_cap } ``` -- **`snap_ref_count`** 定义在 §4.1 `Inode` 上:表示有多少 **独立快照引用** 仍依赖该 **inode 对象**(通常为 COW 后的 **frozen** 副本;活动/live inode 在分裂后一般为 0)。 -- Edit 
log:`SNAPSHOT_CREATE`、`SNAPSHOT_DELETE`、`INODE_COW_SPLIT`(含 
`frozen_fid`、`live_fid`、`snap_ref_delta`),供 standby **确定性 replay**。 +- **`snap_ref_count`** 定义在 §4.1 `Inode` 上:表示有多少 **独立快照引用** 仍依赖该 **inode 对象**(通常为 COW 后的 **frozen** 副本;活动/live inode 在分裂后一般为 0)。目录项时间点语义由 §6.3.1.1 的 `DirSnapshotDiff` 维护,inode 引用计数只解决 frozen inode 生命周期。 +- Edit log:`SNAPSHOT_CREATE`、`SNAPSHOT_DELETE`、`INODE_COW_SPLIT`(含 `frozen_fid`、`live_fid`、`snap_ref_delta`)、`DIR_SNAPSHOT_DIFF_UPDATE`,供 standby **确定性 replay**。 -#### 6.3.6 Frozen inode 引用计数(已决) +#### 6.3.4 Frozen inode 引用计数(已决) **问题**:§6.3 删除快照时「仅回收本快照专属的 frozen inode」。若同一 frozen inode 被 **多个快照** 引用(例如 `/foo` 上连续创建 `s1`、`s2` 后才首次修改 `file1`),**不能在 `snap_ref_count > 0` 时释放**。 @@ -580,7 +613,7 @@ InodeRef { target_fid, txn_id_cap } | **`createSnapshot`** | 根目录 `root_ref.target` **+1**(可选) | 创建本身 O(1);**不**遍历子树给每个文件 +1。未 COW 的文件仍与 live 共用同一 `fid`,读快照走解析路径。 | | **首次 `COW_SPLIT`(file1)** | 对 **frozen_fid**(旧 inode 副本)设为 **覆盖该文件的所有活跃快照数** `K` | 例:存在 `s1`、`s2` 均可见 `file1` 时尚未修改 → `frozen.snap_ref_count = 2`。活动侧新 `live_fid`:`snap_ref_count = 0`。 | | **再建快照 `s3`(已有 frozen file1)** | 若 `s3` 仍指向含 `file1` 的视图且 `file1` 已 frozen:对 `frozen_fid` **+1** | 仅影响 **已分裂** 的 frozen 对象;仍与 live 共用的路径在首次 COW 时一次性结算。 | -| **`deleteSnapshot(s)`** | 对该快照登记过的每个 `frozen_fid`:**-1** | 来自 `cow_inodes` 或快照元数据索引;**仅当减到 0** 时 `free_inode(frozen_fid)` + 释放 BlockMap | +| **`deleteSnapshot(s)`** | 对该快照登记过的每个 `frozen_fid`:**-1** | 来自 `cow_inodes`、`dir_diffs.deleted` 或快照元数据索引;**仅当减到 0** 时 `free_inode(frozen_fid)` + 释放 BlockMap | | **活动路径修改 live inode** | 不增减 | live 与快照引用解耦 | ```text @@ -595,8 +628,13 @@ cow_split(file_fid, parent, name): deleteSnapshot(snap_id): for fid in snap.cow_inodes: - if (--InodeTable[fid].snap_ref_count == 0) + if snapshot_ref_index.remove(fid, snap_id) and --InodeTable[fid].snap_ref_count == 0 free_inode_and_blockmap(fid) + for diff in snap.dir_diffs: + for fid in diff.deleted.values: + if snapshot_ref_index.remove(fid, snap_id) and --InodeTable[fid].snap_ref_count == 0 + free_inode_and_blockmap(fid) + release 
snap.dir_diffs remove SnapshotRecord append EditLog(SNAPSHOT_DELETE, snap_id, ...) ``` @@ -619,9 +657,11 @@ deleteSnapshot(snap_id): ##### fsck(§8.5 扩展) -- 对每个 `snap_ref_count > 0` 的 inode:存在至少一条 `SnapshotRecord` / `cow_inodes` 反向引用。 +- 对每个 `snap_ref_count > 0` 的 inode:存在至少一条 `SnapshotRecord` / `cow_inodes` / `dir_diffs.deleted` / `SnapshotRefIndex` 反向引用。 - 对每个 `SnapshotRecord.cow_inodes` 中的 `fid`:`snap_ref_count >= 1`。 -- 删除快照后的 spot check:`cow_inodes` 中不应出现已 free 的 `fid`。 +- 对每个 `DirSnapshotDiff.deleted` 中的 `frozen_fid`:inode 存在,`snap_ref_count >= 1`,且 `SnapshotRefIndex[frozen_fid]` 包含该 `snap_id`。 +- `snap_ref_count == |SnapshotRefIndex[fid]|`;允许再与所有 `cow_inodes`、`dir_diffs.deleted` 的并集交叉校验。 +- 删除快照后的 spot check:`cow_inodes` / `dir_diffs.deleted` 中不应出现已 free 的 `fid`。 ##### `count_snapshots_covering` 与倒排索引(已决) @@ -631,27 +671,27 @@ deleteSnapshot(snap_id): | 方案 | 做法 | |------|------| -| **A. 倒排索引(推荐)** | 维护 `SnapshotRefIndex: frozen_fid → { snap_id... }`(及可选 `(parent,name) → frozen_fid`)。`createSnapshot`:对仍与 live 共用的路径 **不** 预遍历;**COW 时** 将 `frozen_fid` 登记到 **当前所有覆盖该 `(parent,name)` 的活跃快照**(由 snap 链/目录 Ref 解析一次,写入索引)。`deleteSnapshot`:对 `cow_inodes` 中每个 `fid` 从索引移除 `snap_id`,再 `--snap_ref_count`。 | +| **A. 倒排索引(推荐)** | 维护 `SnapshotRefIndex: frozen_fid → { snap_id... }`(及可选 `(parent,name) → frozen_fid`)。`createSnapshot`:对仍与 live 共用的路径 **不** 预遍历;**COW / 目录 diff 产生 frozen 引用时** 将 `frozen_fid` 登记到 **当前所有覆盖该 `(parent,name)` 的活跃快照**(由 snap 链/目录 Ref 解析一次,写入索引)。`deleteSnapshot`:对 `cow_inodes` 与 `dir_diffs.deleted` 中每个 `fid` 从索引移除 `snap_id`,再 `--snap_ref_count`。 | | **B. 
快照创建时预计算** | 在 `createSnapshot` O(1) 元数据之外,记录「该快照可见的 (parent,name)→fid 视图版本」;首次 COW 时用 **快照差分元数据** 得到 `K`,写入 `snap_ref_count` 与 `cow_inodes`。 | - **禁止**:`deleteSnapshot` 或 replay 时依赖 **未持久化的** 临时扫描结果且与主路径不一致。 - **再建快照 `s3`(file1 已 frozen)**:`SnapshotRefIndex` 对 `frozen_fid` **insert(s3)** 并 `snap_ref_count++`(与上表「再建快照」行一致)。 -- **fsck**:`snap_ref_count == |SnapshotRefIndex[fid]|`(允许索引与 `cow_inodes` 并集交叉校验)。 +- **fsck**:`snap_ref_count == |SnapshotRefIndex[fid]|`(允许索引与 `cow_inodes`、`dir_diffs.deleted` 并集交叉校验)。 -**成熟度说明**:引用计数为业界成熟手段,但须在 **COW 初值 / 多快照叠加 / delete + replay / 索引一致性** 上做 **专项测试**,列入 **P3.1 验收**。 +**成熟度说明**:引用计数为业界成熟手段,但须在 **COW 初值 / 目录 diff frozen 引用 / 多快照叠加 / delete + replay / 索引一致性** 上做 **专项测试**,列入 **P3.1 验收**。 -#### 6.3.4 与 §6.4 Checkpoint 的边界 +#### 6.3.5 与 §6.4 Checkpoint 的边界 | | §6.3 用户快照 | §6.4 Checkpoint/FSImage | |--|----------------|-------------------------| | 目的 | 时间点恢复、误删回滚、对比历史 | MetaServer **重启/冷备**、缩短 replay | | 创建成本 | **O(1)** per snap | O(namespace) 后台扫描(可模糊) | | 读路径 | 快照视图 | 正常命名空间 | -| 存储 | 内存 Ref + 被 COW 分离的 inode | 磁盘 FSImage 文件 | +| 存储 | 内存 Ref + 目录 diff + 被 COW 分离的 inode | 磁盘 FSImage 文件 | 两者可同时存在:HDFS 亦区分 **Snapshot** 与 **Checkpoint(FSImage)**。 -#### 6.3.5 未采纳为用户快照的方案 +#### 6.3.6 未采纳为用户快照的方案 | 方案 | 结论 | |------|------| @@ -671,18 +711,21 @@ deleteSnapshot(snap_id): triggerCheckpoint() // 周期或 MetaCheckpoint RPC ├─ 记录 LAST_TXN_ID = committed_txn_id (N) ├─ 后台线程遍历 InodeTable、DirTable(§4)、BlockMap(可选) - │ 允许与写并发;图像可「模糊」 - ├─ 写出 FSImage + footer(N) + │ 允许与写并发,但只序列化 txn_id <= N 的 committed 视图 + │ 忽略 pending txn>N;对 delete_txn>N 的旧版本仍按 N 时刻保留 + ├─ 写出 FSImage + footer(N);每个 section 带 section checksum 和 max_txn_seen<=N └─ 原子 publish 冷启动:load FSImage(N) → replay Edit Log (txn_id > N) → 一致 ``` -正确性:依赖 §6.7 之「先 Edit Log 再内存 / committed 边界」;模糊项由 replay 修正(同原 §6.3.2 论证)。 +正确性:依赖 §6.7 的版本化可见性边界。Checkpoint 可以与写并发,但 **不能**把 txn>N 的新 dentry/inode 写入 FSImage(N),否则冷启动 replay(txn>N) 会重复 create、复活已删除对象或双加计数。实现必须在扫描时按 `create_txn <= N < delete_txn` 过滤,或在 
FSImage 记录中携带版本并在 load 阶段过滤。 #### 6.4.2 FSImage 内容与 Large 目录 -- `section_inodes`、`section_dirs`(Small 桶或嵌入 **`kfstree` checkpoint 流**)。 +- `section_inodes`、`section_dirs`(Small 逻辑项或 Large `DirBTree` 记录流),记录必须带可过滤的 create/delete txn 或保证已经按 N 过滤。 +- `section_snapshots`:只写 `create_txn <= N` 且未在 N 前删除的 `SnapshotRecord`,包括 `cow_inodes`、`dir_diffs` 与可重建 `SnapshotRefIndex` 的记录;`DIR_SNAPSHOT_DIFF_UPDATE` 中 txn>N 的变化不得进入 FSImage(N)。 +- `section_blockmap`(可选):若写入,则与 inode 一样按版本过滤,避免 replay 后重复块引用计数。 - log 截断:**可选**运维操作,非恢复前提。 #### 6.4.3 代价 @@ -714,11 +757,13 @@ triggerCheckpoint() // 周期或 MetaCheckpoint RPC **提议默认顺序**: ```text -(分片锁内)改内存 → append 到 log 内存 buffer → 释放锁 +reserve txn_id +(分片锁内)写 pending 版本 → append 到 log 内存 buffer → 释放锁 (log 线程)buffer → 复制 → fsync → 推进 committed_txn_id +(发布阶段)txn_id <= committed_txn_id 的 pending 版本进入 committed 视图 ``` -对比 QFS:**先 log committed 再 `handle()`**,客户端等待包含「空窗期」内无法从内存读到结果的双重延迟。本 RFC 的可见性边界见 **§6.7**:其他客户端以 **已提交命名空间** 为准;发起方在 RPC 成功后的可见范围与 **lease / sync 策略** 对齐 HDFS 习惯,而非「未提交 txn 全网可见」。 +对比 QFS:**先 log committed 再 `handle()`**,客户端等待包含「空窗期」内无法从内存读到结果的双重延迟。本 RFC 将内存修改拆成 **pending 版本** 与 **committed 视图**:写路径可以先构造 pending 状态并排队 fsync,但普通读路径只能读 committed 视图。发起方在 RPC 成功后的可见范围与 **lease / sync 策略** 对齐 HDFS 习惯,而非「未提交 txn 全网可见」。 ### 6.7 读一致性(已决) @@ -726,8 +771,8 @@ triggerCheckpoint() // 周期或 MetaCheckpoint RPC | 场景 | 规则 | |------|------| -| **命名空间变更**(CREATE / REMOVE / RENAME …) | 对其他客户端:仅在 edit **已 committed**(`committed_txn_id` 推进、quorum 复制完成)后可见;primary 内存中未 fsync 的 buffer **不**对外暴露。 | -| **RPC 返回与 durable** | `sync=always`:成功返回 ≡ 命名空间变更已 durable,他客户端可见(在 primary 正常服务前提下)。`sync=batch`:返回表示 **已接受**;他客户端可见时点不早于本批 **组提交 fsync**(类比 HDFS edit 组提交窗口)。 | +| **命名空间变更**(CREATE / REMOVE / RENAME …) | 对其他客户端:仅在 edit **已 committed**(`committed_txn_id` 推进、quorum 复制完成)后可见;primary 内存中的 pending 版本 **不**进入普通读视图。 | +| **RPC 返回与 durable** | `sync=always`:成功返回 ≡ 命名空间变更已 durable,他客户端可见(在 primary 正常服务前提下)。`sync=batch`:返回表示 **已接受并分配 txn/fid**;他客户端可见时点不早于本批 **组提交 
fsync**。若需要 read-your-writes,必须用会话 token 或等待 txn committed。 | | **文件数据读写** | 命名空间登记(create 得 fid)与 **写数据** 分离;已打开文件的读写一致性由 **chunk lease** 保证写者独占/租约续期,读者看到已提交块版本,与 HDFS 「NN 管名字、DN 管块 + lease」分工一致。 | | **Primary / standby** | 仅 primary 执行 namespace 变更并写 edit;standby 通过 log replay 追赶;客户端 mutating 与强一致命名空间读面向 primary(与现 VR 一致)。 | @@ -736,7 +781,7 @@ triggerCheckpoint() // 周期或 MetaCheckpoint RPC - **(a) 仅 primary 本地可见未提交变更**:不足以定义多客户端语义,且与 backup 复制模型冲突。 - **(b) 未提交 txn 全网可见**:破坏恢复与 fsck 假设,并引入跨客户端脏读。 -**实现提示**:可在 `Inode` 或目录上保留 `last_committed_txn`;`lookup` / `readdir` 仅暴露 `txn_id ≤ committed_txn_id` 的视图;写路径 lease 逻辑复用现有 QFS 实现,本层不新增第二套租约协议。 +**实现提示**:DirEntry/Inode 需要携带 `create_txn`、`delete_txn`(或等价版本区间)与 pending 标志;`lookup` / `readdir` 只暴露 `create_txn <= committed_txn_id < delete_txn` 的视图。commit 发布可以批量翻转 pending,也可以只推进全局 `committed_txn_id` 并在读路径过滤。写路径 lease 逻辑复用现有 QFS 实现,本层不新增第二套租约协议。 --- @@ -750,14 +795,36 @@ triggerCheckpoint() // 周期或 MetaCheckpoint RPC | `InodeTable` | `hash(fid) % M` 分片;读多写少用 RW lock | | `FidAllocator` | 无锁原子或独立 mutex | | `EditLog buffer` | 单写者 + MPSC 队列 | -| `PathCache` | RCU 或 per-shard 锁 | +| 客户端 `PathCache` | 客户端本地缓存,不在 MetaServer 锁层次内 | **禁止**:所有 mutating RPC 共用一个 `submit_request` 全局 mutex(现状瓶颈)。 ### 7.2 与 B+ 树分片锁的区别 对 **全局 `metatree`(单例 B+ 树)**,「按 parent 加锁」**不安全**(不同目录可能 split 同一内部节点,见 `MetaTree-Lock-Optimization.md`)。 -对 **DirTable 分片**:按 `parent_fid` 加锁 **安全**——Small 为独立 `flat_hash`;Large 为 **该目录专属 `Tree` 实例**(仍用 `kfstree`,但不与别目录共享内部节点)。 +对 **DirTable 分片**:按 `parent_fid` 加锁 **安全**——Small 为独立 `flat_hash`;Large 为 **该目录专属 `DirBTree` 实例**(抽取/适配 `kfstree` 节点算法,但不与别目录共享内部节点)。 + +### 7.3 跨目录操作锁顺序(已决) + +`rename`、dumpster move、快照 COW / 目录 diff 更新会同时触碰多个目录、inode、BlockMap 与快照元数据,必须使用全局确定性锁顺序,禁止按调用路径临时加锁。 + +**锁顺序**: + +```text +1. SnapshotRegistry / SnapshotRefIndex 元数据锁(仅快照相关操作) +2. DirShard locks,按 (shard_id, parent_fid) 升序;同一目录只加一次 +3. InodeTable locks,按 fid 升序 +4. BlockMap locks,按 fid 升序 +5. 
EditLog append buffer(只追加内存 buffer,不在锁内等待 fsync) +``` + +规则: + +- `RENAME(src_parent, name, dst_parent, new_name)`:先按 `(shard_id, parent_fid)` 顺序拿源/目标父目录写锁;在锁内重新校验源项存在、目标项冲突、权限和 generation;再写入同一个 txn 的 pending `delete(src)` + `create(dst)`,并更新 inode parent/name 与目录 diff。 +- `remove(..., todumpster=true)`:视为从源父目录 rename 到 `dumpster_fid`,按同一 DirShard 顺序加锁,不给 dumpster 单独开后门锁。 +- 快照 COW / diff:先拿 snapshot 元数据锁,确定受影响的 `snap_id` / `DirSnapshotDiff` / `SnapshotRefIndex`,再按目录和 fid 顺序加锁;不得持有低层锁后再回头等待 snapshot 元数据锁。 +- 冲突处理:多资源操作使用 `try_lock` + 释放已持有锁 + 退避重试,避免 ABBA;禁止在持有另一把目录锁时做读锁升级为写锁。 +- `EditLog` 只在已完成内存 pending 版本后 append buffer;`fsync` / quorum 等待发生在释放业务锁之后。 --- @@ -823,7 +890,7 @@ Phase A — InodeTable Phase B — DirTable(每个目录 fid) 按 DirNode.layout 枚举: - **SMALL**:flat_hash 全桶扫描,校验无重复 NameKey、探测链有界 - - **LARGE**:遍历该目录专属 `Tree` 叶(同现 `kfstree` 迭代),校验 `Key(KFS_DENTRY, parent, hash)` 与 name 唯一 + - **LARGE**:遍历该目录专属 `DirBTree` 叶(同现 `kfstree` 迭代),校验 `Key(KFS_DENTRY, parent, hash)` 与 name 唯一 对每条 DirEntry (name → child_fid): - InodeTable[child_fid] 存在且 parent_fid == 当前目录 fid 对 InodeTable 中 type=dir 的项: @@ -839,10 +906,12 @@ Phase C — 双向一致 Phase D — 与 edit committed 视图一致(可选在线 fsck) 仅扫描 txn_id ≤ committed_txn_id 的视图(§6.7) -Phase E — 用户快照引用计数(§6.3.6) +Phase E — 用户快照与目录 diff(§6.3.1.1 / §6.3.4) - 对每个 SnapshotRecord:cow_inodes 中 fid 存在且 snap_ref_count >= 1 - - 对每个 snap_ref_count > 0 的 inode:至少被一个 SnapshotRecord.cow_inodes 引用 - - 无 snap_ref_count == 0 且仅被快照元数据悬挂的 unreachable frozen + - 对每个 DirSnapshotDiff:所属 snap_id 存在,base_txn <= committed_txn_id,created/deleted 的 NameKey 无重复 + - 对每个 DirSnapshotDiff.deleted 中的 frozen_fid:inode 存在,snap_ref_count >= 1,SnapshotRefIndex 包含该 snap_id + - 对每个 snap_ref_count > 0 的 inode:至少被 SnapshotRecord.cow_inodes、dir_diffs.deleted 或 SnapshotRefIndex 引用 + - snap_ref_count == |SnapshotRefIndex[fid]|;无 snap_ref_count == 0 且仅被快照元数据悬挂的 unreachable frozen ``` 报告格式可继续兼容现 `MetaFsck` / `kfsfsck` 客户端字段;内部扫描源从 `metatree` 迭代改为 **InodeTable + DirIndex 枚举**。 @@ -858,7 
+927,7 @@ Phase E — 用户快照引用计数(§6.3.6) | **P2** | `DirTable`(§4.2 Small+Large+promotion)+ `InodeTable`;百万级单目录基准 | 高 | | **P2.1** | §8.4 特殊路径 + §8.5 fsck(含两种 DirNode layout) | 可运维 | | **P3** | v2 edit + §6.4 Checkpoint(FSImage N + replay)+ 冷启动闭环 | 很高 | -| **P3.1** | §6.3 用户快照 + §6.3.6 `snap_ref_count`(COW/delete/replay/fsck 测试) | 可回滚目录 | +| **P3.1** | §6.3 用户快照 + 目录 Diff + §6.3.4 `snap_ref_count`(COW/delete/replay/fsck 测试) | 可回滚目录 | (**范围外**:多 MetaServer namespace 分片、BlockMap 独立服务、inode 换出等,不列入本 RFC 路线图。) @@ -879,8 +948,11 @@ Phase E — 用户快照引用计数(§6.3.6) | Promotion 原子性与可见性 | §4.2.6 | | Promotion 墙钟上限(写者饥饿) | §4.2.6 | | Readdir cookie 逻辑位置(`NameKey`) | §5.4.1 | -| 用户快照(HDFS 式 Ref + 文件级 COW) | §6.3 | -| Frozen inode `snap_ref_count` + 倒排索引 | §6.3.6 | +| 用户快照(HDFS 式 Ref + 文件级 COW + 目录 Diff) | §6.3 | +| Frozen inode `snap_ref_count` + 倒排索引 | §6.3.4 | +| 目录快照 Diff | §6.3.1.1 | +| Pending / committed 视图 | §6.6 / §6.7 | +| 跨目录锁顺序 | §7.3 | | Checkpoint/FSImage(一致性点 + 后台遍历) | §6.4 | | Checkpoint 扫描内存与节流 | §6.4.4 | @@ -895,7 +967,7 @@ Phase E — 用户快照引用计数(§6.3.6) | 保留全局 B+ 树,仅优化锁 | 无法消除双 insert 与树分裂;并发上限低(见 `MetaTree-Lock-Optimization.md`) | | 仅全局 B+ 树 | 已否决;见 §4.2.5 | | 单目录百万项仍用平铺 HashMap+链表 | **已否决**;首版必须 Large 布局 + promotion | -| 每目录一棵 B+ 树(Large 布局) | **已采纳**,**复用 `kfstree`**,仅用于 `child_count ≥ threshold` 的目录 | +| 每目录一棵 B+ 树(Large 布局) | **已采纳**,**抽取/适配 `kfstree` 节点算法**,仅用于 `child_count ≥ threshold` 的目录 | | 自研另一套目录 B-tree 实现 | **已否决**,与现网重复且难保持 checkpoint 一致 | | 纯 tmpfs、无持久化 | 不符合 QFS 定位 | | 完全照搬 RocksDB/LSM 存 namespace | 写放大与 create 延迟不如 hash + edit log 直接 | @@ -920,7 +992,7 @@ Phase E — 用户快照引用计数(§6.3.6) ## 13. 
参考文献(仓库内) -- `src/cc/meta/kfstree.h` / `kfstree.cc` — B+ 树(Large 目录 **复用** 本实现;全局 `metatree` 不再用于 namespace dentry) +- `src/cc/meta/kfstree.h` / `kfstree.cc` — B+ 树(Large 目录 **抽取/适配** 节点算法;全局 `metatree` 不再用于 namespace dentry) - `src/cc/meta/kfsops.cc` — `Tree::create` / `link` 双 `insert` - `src/cc/meta/MetaRequest.cc` — `MetaCreate::start` / `handle`,`SubmitBegin` - `src/cc/meta/LogWriter.cc` — `Enqueue`、`WriteLog`、`fsync` @@ -943,10 +1015,11 @@ Phase E — 用户快照引用计数(§6.3.6) | 0.5 | 2026-05-25 | 范围限定单机内存;去掉 namespace 水平分片/换出开放项与 P4 路线图 | | 0.6 | 2026-05-25 | §8.4 特殊路径、§8.5 fsck 已决;§10 无剩余开放项 | | 0.7 | 2026-05-25 | §4.2 大目录首版必做:Small flat_hash + Large 每目录 B+ 树 + promotion | -| 0.8 | 2026-05-25 | Large 布局明确复用现 `kfstree`(`Tree`/`Node`/`Key`/`MetaDentry`),不新写 B-tree | +| 0.8 | 2026-05-25 | Large 布局明确抽取/适配现 `kfstree` 节点算法(`Node`/`Key`/`MetaDentry`),不新写 B-tree | | 0.9 | 2026-05-25 | §6.3 已决:一致性点 + 后台模糊 FSImage + replay(txn>N) | | 1.0 | 2026-05-25 | §6.3 改为 HDFS 式用户快照(InodeRef+文件级COW);§6.4 为 Checkpoint/FSImage | | 1.1 | 2026-05-25 | §4.2.6 Promotion:`PROMOTING` 状态、staging、读 Small/写等待、原子发布 | -| 1.2 | 2026-05-25 | §6.3.6:`snap_ref_count`、COW/删快照维护、fsck 与无环不变量 | +| 1.2 | 2026-05-25 | §6.3.4:`snap_ref_count`、COW/删快照维护、fsck 与无环不变量 | | 1.3 | 2026-05-25 | §5.4.1:readdir cookie 用逻辑 key 游标,禁止 LeafIter/节点指针 | -| 1.4 | 2026-05-25 | §4.2.6 晋升墙钟上限;§5.4.1 `last_key`;§6.3.6 倒排索引;§6.4.4 checkpoint 内存 | +| 1.4 | 2026-05-25 | §4.2.6 晋升墙钟上限;§5.4.1 `last_key`;§6.3.4 倒排索引;§6.4.4 checkpoint 内存 | +| 1.5 | 2026-05-25 | 补充 pending/committed 视图、目录快照 diff、checkpoint 版本过滤、跨目录锁顺序 | From a4dfe4205247582af58fd337673055377736b674 Mon Sep 17 00:00:00 2001 From: zhangzhibiao Date: Mon, 1 Jun 2026 16:26:01 +0800 Subject: [PATCH 7/7] Optimize QFS metadata and write path --- benchmarks/mstress/mstress_client.cc | 140 +- clean_start_cluster.sh | 49 + docs/rfc/RFC-0002-hdfs-like-write-flow.md | 161 + ...FC-0003-write-path-optimization-summary.md | 217 ++ press.sh | 11 + src/cc/chunk/ChunkManager.cc | 120 +- 
src/cc/chunk/ChunkManager.h | 6 + src/cc/chunk/ClientSM.cc | 7 +- src/cc/chunk/KfsOps.cc | 145 +- src/cc/chunk/KfsOps.h | 48 +- src/cc/chunk/LeaseClerk.cc | 2 +- src/cc/common/BufferedLogWriter.cc | 2 +- src/cc/kfsio/IOBuffer.cc | 23 + src/cc/kfsio/IOBuffer.h | 4 + src/cc/libclient/KfsClient.cc | 262 +- src/cc/libclient/KfsClientInt.h | 25 +- src/cc/libclient/KfsOps.cc | 29 +- src/cc/libclient/KfsOps.h | 24 +- src/cc/libclient/KfsProtocolWorker.cc | 8 +- src/cc/libclient/KfsProtocolWorker.h | 7 +- src/cc/libclient/WriteAppender.cc | 2 + src/cc/libclient/Writer.cc | 736 +++- src/cc/libclient/Writer.h | 64 +- src/cc/meta/CMakeLists.txt | 19 +- src/cc/meta/Checkpoint.cc | 4 + src/cc/meta/ChunkServer.cc | 32 +- src/cc/meta/ChunkServer.h | 1 + src/cc/meta/ClientManager.h | 4 + src/cc/meta/ClientSM.h | 2 + src/cc/meta/LayoutManager.cc | 126 +- src/cc/meta/LayoutManager.h | 5 + src/cc/meta/LogWriter.cc | 337 +- src/cc/meta/MetaRequest.cc | 630 +++- src/cc/meta/MetaRequest.h | 50 +- src/cc/meta/NamespaceV2.cc | 2969 +++++++++++++++++ src/cc/meta/NamespaceV2.h | 457 +++ src/cc/meta/NetDispatch.cc | 105 +- src/cc/meta/NetDispatch.h | 3 + src/cc/meta/Replay.cc | 189 ++ src/cc/meta/Replay.h | 1 + src/cc/meta/Restorer.cc | 161 + src/cc/meta/namespacev2bench_main.cc | 367 ++ src/cc/meta/namespacev2test_main.cc | 886 +++++ src/cc/meta/namespacev2walreplaytest_main.cc | 169 + src/cc/qcdio/QCThread.cc | 7 +- src/cc/tools/qfsput_main.cc | 9 +- 46 files changed, 8464 insertions(+), 161 deletions(-) create mode 100755 clean_start_cluster.sh create mode 100644 docs/rfc/RFC-0002-hdfs-like-write-flow.md create mode 100644 docs/rfc/RFC-0003-write-path-optimization-summary.md create mode 100755 press.sh create mode 100644 src/cc/meta/NamespaceV2.cc create mode 100644 src/cc/meta/NamespaceV2.h create mode 100644 src/cc/meta/namespacev2bench_main.cc create mode 100644 src/cc/meta/namespacev2test_main.cc create mode 100644 src/cc/meta/namespacev2walreplaytest_main.cc diff --git 
a/benchmarks/mstress/mstress_client.cc b/benchmarks/mstress/mstress_client.cc index 96a7c1620..87d69ab3f 100644 --- a/benchmarks/mstress/mstress_client.cc +++ b/benchmarks/mstress/mstress_client.cc @@ -37,6 +37,8 @@ #include #include #include +#include +#include #if __cplusplus >= 201103L #include @@ -45,12 +47,40 @@ using namespace std; #include "libclient/KfsClient.h" +#include "common/Properties.h" FILE* logFile = stdout; #define TEST_BASE_DIR "/mstress" #define COUNT_INCR 500 +struct WriteTimingStats { + int64_t openUsec; + int64_t writeUsec; + int64_t closeUsec; + int64_t openCount; + int64_t writeCount; + int64_t closeCount; + + WriteTimingStats() + : openUsec(0), + writeUsec(0), + closeUsec(0), + openCount(0), + writeCount(0), + closeCount(0) + {} +}; + +static WriteTimingStats gWriteTimingStats; + +static int64_t NowUsec() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (int64_t)tv.tv_sec * 1000000 + (int64_t)tv.tv_usec; +} + /* This program is invoked with the following arguments: - qfs server/port @@ -139,6 +169,7 @@ struct Client { int levels_; int inodesPerLevel_; int pathsToStat_; + int64_t fileSize_; }; const size_t Client::INITIAL_SIZE = 1 << 12; @@ -148,7 +179,17 @@ class AutoCleanupKfsClient public: AutoCleanupKfsClient(Client* client) : initialized(false) { - kfsClient = KFS::Connect(client->dfsServer_, client->dfsPort_); + const char* const config = getenv("QFS_CLIENT_CONFIG"); + if (config && config[0]) { + KFS::Properties props; + if (props.loadProperties(config, '=') == 0) { + kfsClient = KFS::Connect(client->dfsServer_, client->dfsPort_, &props); + } else { + kfsClient = 0; + } + } else { + kfsClient = KFS::Connect(client->dfsServer_, client->dfsPort_); + } if (kfsClient) { initialized = true; } @@ -190,6 +231,23 @@ void myitoa(int n, char* buf, size_t len = 32) snprintf(buf, len, "%d", n); } +static void DumpQfsClientStats(KFS::KfsClient* kfsClient, const char* tag) +{ + if (! 
kfsClient) { + return; + } + KFS::Properties* const stats = kfsClient->GetStats(); + if (! stats) { + return; + } + fprintf(logFile, "\n=== qfs_client stats (%s) ===\n", (tag ? tag : "")); + for (KFS::Properties::iterator it = stats->begin(); it != stats->end(); ++it) { + fprintf(logFile, "%s=%s\n", it->first.c_str(), it->second.c_str()); + } + fprintf(logFile, "=== end qfs_client stats ===\n\n"); + delete stats; +} + //Return a random permutation of numbers in [0..range). void unique_random(vector& result, size_t range) { @@ -292,6 +350,7 @@ void ParsePlanFile(Client* client) { string line; ifstream ifs(client->planfilePath_.c_str(), ifstream::in); + client->fileSize_ = 0; while (ifs.good()) { getline(ifs, line); @@ -314,6 +373,10 @@ void ParsePlanFile(Client* client) client->pathsToStat_ = atoi(line.substr(6).c_str()); continue; } + if (line.substr(0, 9) == "filesize=") { + client->fileSize_ = atoll(line.substr(9).c_str()); + continue; + } } ifs.close(); if (client->levels_ <= 0 || client->inodesPerLevel_ <= 0 || client->type_.empty()) { @@ -355,6 +418,7 @@ int CreateDFSPaths(Client* client, AutoCleanupKfsClient* kfs, int level, int* cr (*createdCount)++; if (*createdCount > 0 && (*createdCount) % COUNT_INCR == 0) { fprintf(logFile, "Created paths so far: %d\n", *createdCount); + fflush(logFile); } if (!isLeaf) { rc = CreateDFSPaths(client, kfs, level+1, createdCount); @@ -365,14 +429,56 @@ int CreateDFSPaths(Client* client, AutoCleanupKfsClient* kfs, int level, int* cr } } else { //fprintf(logFile, "Creating file [%s]\n", client->path_.actualPath_); - rc = kfsClient->Create(client->path_.String()); + const int64_t openStartUsec = NowUsec(); + rc = client->fileSize_ > 0 ? 
+ kfsClient->Open(client->path_.String(), O_CREAT|O_RDWR) : + kfsClient->Create(client->path_.String()); + if (client->fileSize_ > 0) { + gWriteTimingStats.openUsec += NowUsec() - openStartUsec; + gWriteTimingStats.openCount++; + } if (rc < 0) { fprintf(logFile, "Create(%s) failed with rc=%d\n", client->path_.String(), rc); return rc; } + if (client->fileSize_ > 0) { + static const size_t kWriteBufSize = 1 << 20; + if (*createdCount == 0) { + fprintf(logFile, "Writing %lld bytes per file...\n", + (long long)client->fileSize_); + fflush(logFile); + } + static vector sWriteBuf(kWriteBufSize, 'x'); + int64_t remaining = client->fileSize_; + while (remaining > 0) { + const size_t len = (remaining < (int64_t)kWriteBufSize) ? + (size_t)remaining : kWriteBufSize; + const int64_t writeStartUsec = NowUsec(); + const ssize_t wr = kfsClient->Write(rc, &sWriteBuf[0], len); + gWriteTimingStats.writeUsec += NowUsec() - writeStartUsec; + gWriteTimingStats.writeCount++; + if (wr != (ssize_t)len) { + fprintf(logFile, "Write(%s) failed expected=%zu actual=%ld\n", + client->path_.String(), len, (long)wr); + kfsClient->Close(rc); + return (wr < 0 ? 
(int)wr : -EIO); + } + remaining -= wr; + } + const int64_t closeStartUsec = NowUsec(); + const int closeErr = kfsClient->Close(rc); + gWriteTimingStats.closeUsec += NowUsec() - closeStartUsec; + gWriteTimingStats.closeCount++; + if (closeErr < 0) { + fprintf(logFile, "Close(%s) failed with rc=%d\n", + client->path_.String(), closeErr); + return closeErr; + } + } (*createdCount)++; if (*createdCount > 0 && (*createdCount) % COUNT_INCR == 0) { fprintf(logFile, "Created paths so far: %d\n", *createdCount); + fflush(logFile); } } client->path_.Pop(name); @@ -407,6 +513,32 @@ int CreateDFSPaths(Client* client, AutoCleanupKfsClient* kfs) struct timeval tvZigma; gettimeofday(&tvZigma, NULL); fprintf(logFile, "Client: %d paths created in %ld msec\n", createdCount, TimeDiffMilliSec(&tvAlpha, &tvZigma)); + fflush(logFile); + if (client->fileSize_ > 0) { + const long totalMsec = TimeDiffMilliSec(&tvAlpha, &tvZigma); + fprintf(logFile, "Client: %lld bytes written in %ld msec\n", + (long long)createdCount * (long long)client->fileSize_, + totalMsec); + fprintf(logFile, + "Client write timing: open count=%lld total=%lld usec avg=%lld usec\n", + (long long)gWriteTimingStats.openCount, + (long long)gWriteTimingStats.openUsec, + (long long)(gWriteTimingStats.openCount ? + gWriteTimingStats.openUsec / gWriteTimingStats.openCount : 0)); + fprintf(logFile, + "Client write timing: write count=%lld total=%lld usec avg=%lld usec\n", + (long long)gWriteTimingStats.writeCount, + (long long)gWriteTimingStats.writeUsec, + (long long)(gWriteTimingStats.writeCount ? + gWriteTimingStats.writeUsec / gWriteTimingStats.writeCount : 0)); + fprintf(logFile, + "Client write timing: close count=%lld total=%lld usec avg=%lld usec\n", + (long long)gWriteTimingStats.closeCount, + (long long)gWriteTimingStats.closeUsec, + (long long)(gWriteTimingStats.closeCount ? 
+ gWriteTimingStats.closeUsec / gWriteTimingStats.closeCount : 0)); + fflush(logFile); + } return 0; } @@ -446,6 +578,7 @@ int StatDFSPaths(Client* client, AutoCleanupKfsClient* kfs) { if (count > 0 && count % COUNT_INCR == 0) { fprintf(logFile, "Stat paths so far: %d\n", count); + fflush(logFile); } } @@ -496,6 +629,7 @@ int ListDFSPaths(Client* client, AutoCleanupKfsClient* kfs) { children.pop_back(); if (inodeCount > 0 && inodeCount % COUNT_INCR == 0) { fprintf(logFile, "Readdir paths so far: %d\n", inodeCount); + fflush(logFile); } } } @@ -588,6 +722,7 @@ int RemoveDFSPaths(Client* client, AutoCleanupKfsClient* kfs) { int main(int argc, char* argv[]) { Client client; + setvbuf(logFile, NULL, _IOLBF, 0); parse_options(argc, argv, &client); @@ -612,6 +747,7 @@ int main(int argc, char* argv[]) fprintf(logFile, "Error: unrecognized test '%s'", client.testName_.c_str()); return -1; } + DumpQfsClientStats(kfs.GetClient(), client.testName_.c_str()); return result; } diff --git a/clean_start_cluster.sh b/clean_start_cluster.sh new file mode 100755 index 000000000..228b86ce7 --- /dev/null +++ b/clean_start_cluster.sh @@ -0,0 +1,49 @@ + pkill -f 'mstress_100k_1m_file' + pkill -f 'mstress_client' + pkill -f 'metaserver' + pkill -f 'chunkserver' + +# clean cluster + + cd /work/bigo-qfs + TS=$(date +%Y%m%d_%H%M%S) + + rm -f qfsbase/meta/metaserver.pid \ + qfsbase/chunk1/chunkserver.pid \ + qfsbase/chunk2/chunkserver.pid \ + qfsbase/chunk3/chunkserver.pid + + mv qfsbase/meta/logs qfsbase/meta/logs.bak.$TS 2>/dev/null || true + mv qfsbase/meta/checkpoints qfsbase/meta/checkpoints.bak.$TS 2>/dev/null || true + mkdir -p qfsbase/meta/logs qfsbase/meta/checkpoints + + for p in \ + qfsbase/chunk1/chunkdir11 \ + qfsbase/chunk1/chunkdir12 \ + qfsbase/chunk2/chunkdir21 \ + qfsbase/chunk3/chunkdir31 + do + mv "$p" "$p.bak.$TS" 2>/dev/null || true + mkdir -p "$p" + done + +# start meta + cd /work/bigo-qfs + + bld/output/bin/metaserver -c qfsbase/meta/conf/MetaServer.prp 
qfsbase/meta/MetaServer.log + + setsid -f bld/output/bin/metaserver \ + qfsbase/meta/conf/MetaServer.prp \ + qfsbase/meta/MetaServer.log \ + >> qfsbase/meta/MetaServer.out 2>&1 + +# start chunk + cd /work/bigo-qfs + + setsid -f bld/output/bin/chunkserver qfsbase/chunk1/conf/ChunkServer.prp qfsbase/chunk1/ChunkServer.log > qfsbase/chunk1/ChunkServer.out 2>&1 + setsid -f bld/output/bin/chunkserver qfsbase/chunk2/conf/ChunkServer.prp qfsbase/chunk2/ChunkServer.log > qfsbase/chunk2/ChunkServer.out 2>&1 + setsid -f bld/output/bin/chunkserver qfsbase/chunk3/conf/ChunkServer.prp qfsbase/chunk3/ChunkServer.log > qfsbase/chunk3/ChunkServer.out 2>&1 + +sleep 6 + +bld/output/bin/tools/qfsping -m -s 202.168.115.34 -p 20000 diff --git a/docs/rfc/RFC-0002-hdfs-like-write-flow.md b/docs/rfc/RFC-0002-hdfs-like-write-flow.md new file mode 100644 index 000000000..41be84aaa --- /dev/null +++ b/docs/rfc/RFC-0002-hdfs-like-write-flow.md @@ -0,0 +1,161 @@ +# RFC-0002: HDFS-like Write Flow for QFS + +## Summary + +This plan introduces an optional HDFS-like write allocation path for QFS. +The goal is to reduce small-file write latency by removing the synchronous +metaserver-to-chunkserver pre-create step from the normal replicated write +allocation path. + +The existing QFS write flow is preserved by default. The new path is enabled +only when both metaserver and chunkserver switches are turned on. + +## Current QFS Write Flow + +For a normal replicated write, the current QFS path is: + +1. Client creates or opens a file and obtains a file id. +2. Client write enters `Writer`. +3. `Writer` sends `ALLOCATE` to metaserver when it needs a chunk. +4. Metaserver selects chunkservers, creates metadata, grants a write lease. +5. Metaserver sends `ALLOCATE_CHUNK` to chunkserver before replying to client. +6. `ALLOCATE_CHUNK` is logged as an in-flight metaserver-to-chunkserver op. +7. Chunkserver creates the local chunk and registers the lease. +8. Client receives allocation result. +9. 
Client sends `WRITE_ID_ALLOC` to chunkserver. +10. Client sends `WRITE_PREPARE` / `WRITE_SYNC` data RPCs. +11. Client close / sync waits for pending writes and sends `CLOSE_CHUNK`. + +The hot part for small-file writes is steps 5-7. In benchmark investigation, +the pre-RPC in-flight logging before `ALLOCATE_CHUNK` dominated allocation +latency. + +## Target HDFS-like Compatible Flow + +With the new optional path: + +1. Client creates or opens a file and obtains a file id. +2. Client write enters `Writer`. +3. `Writer` sends `ALLOCATE` to metaserver when it needs a chunk. +4. Metaserver selects chunkservers, creates metadata, grants a write lease. +5. Metaserver replies to client without sending `ALLOCATE_CHUNK`. +6. Client sends `WRITE_ID_ALLOC` with `File-handle` and `Lease-id`. +7. Chunkserver lazily creates the chunk if it does not exist. +8. Chunkserver registers the write lease locally. +9. Client sends `WRITE_PREPARE` / `WRITE_SYNC` data RPCs. +10. Client close / sync waits for pending writes and sends `CLOSE_CHUNK`. + +This keeps the existing QFS client write protocol mostly intact while moving +chunk creation from metaserver-driven pre-create to client-write-driven lazy +creation. + +## Config Switches + +Both switches must be enabled: + +```properties +metaServer.writeFlow.hdfsLikeAllocate = 1 +chunkServer.writeFlow.lazyCreateOnWrite = 1 +``` + +Default behavior remains the original QFS path: + +```properties +metaServer.writeFlow.hdfsLikeAllocate = 0 +chunkServer.writeFlow.lazyCreateOnWrite = 0 +``` + +## Implemented Changes + +Implemented in this branch: + +- Metaserver: + - Added `metaServer.writeFlow.hdfsLikeAllocate`. + - For normal replicated non-append, non-striped, non-object-store allocation, + metaserver can skip chunkserver pre-create. + - The allocation request is completed immediately through `LayoutDone()`. + - `MetaAllocate` response now includes `Lease-id` when available. + +- Client: + - `AllocateOp` now parses `Lease-id`.
+ - `WRITE_ID_ALLOC` now carries optional `File-handle` and `Lease-id`. + - `Writer` passes file id and lease id to `WRITE_ID_ALLOC`. + - `WriteAppender` also passes these fields, but append remains excluded from + the HDFS-like metaserver bypass path. + +- Chunkserver: + - Added `chunkServer.writeFlow.lazyCreateOnWrite`. + - `WRITE_ID_ALLOC` parses optional `File-handle` and `Lease-id`. + - If lazy-create is enabled and a normal write targets a missing chunk, + chunkserver creates the chunk in `AllocateWriteId()`. + - The write lease is registered locally from the lease id carried by the + client before the normal lease validation. + +## Not Fully Done + +This is not yet a complete production-grade HDFS write-flow replacement. + +- Client-CS auth and synchronous replication token semantics are not fully + reworked for lazy creation. +- Append, object-store, and striped files intentionally remain on the original + QFS allocation path. +- Crash-recovery behavior has not yet been validated with kill/restart tests. +- The current implementation still includes prior timing instrumentation in + the write path; decide later whether to keep or clean it up. +- The nested `MetaLogChunkAllocate` path now explicitly schedules a log flush + after enqueue; without this, HDFS-like allocate skipped chunkserver wait but + still waited for the metaserver log writer timeout cadence. + +## Test Plan + +Minimum validation before treating this as stable: + +1. Build: + - `cmake --build bld --target metaserver chunkserver qfsput mstress_client -j8` +2. Unit test: + - `./bld/output/bin/devtools/namespacev2test` +3. Functional write test: + - Enable both switches. + - Clean cluster and restart metaserver/chunkservers. + - Write a 1MB file with `qfsput`. + - Verify `qfs -ls` reports the expected size. + - Read the file back and verify byte count/content. +4. Benchmark: + - Run small-file write benchmark with old path. + - Run the same benchmark with HDFS-like path enabled. 
+ - Compare total time, `Write.AllocateUsec`, close latency, and chunkserver + lazy-create failures. +5. Failure tests: + - Kill client after metaserver allocation but before write. + - Kill chunkserver after lazy create but before close. + - Restart and verify `HELLO` / `AVAILABLE_CHUNK` convergence. + +## Current Verification Status + +Completed: + +- Build passed for `metaserver`, `chunkserver`, `qfsput`, and `mstress_client`. +- `namespacev2test` passed. +- `git diff --check` passed. +- Clean-cluster functional 1MB `qfsput` passed: `qfs -ls` reported 1048576 bytes. +- HDFS-like lazy-create path was verified in chunkserver logs. +- 50 x 1MB write probe after fixing nested log flush: + - Total time: 366 ms. + - Previous HDFS-like run before the flush fix: 50454 ms. + - `Write.AllocateUsec`: 7587 usec total for 49 allocations, down from about 49048728 usec. + - Client close average: 6668 usec, down from about 1008198 usec. +- 1000 x 1MB write probe after the flush fix: + - Total time: 18265 ms. + - `Write.AllocateUsec`: 159286 usec total for 999 allocations. + - `Write.ChunkWriteUsec`: 16222757 usec total; the remaining dominant cost is chunk write/close, not metaserver allocate. + +Crash/restart validation: + +- Completed-file full restart passed: wrote `/recovery/ok_8m`, restarted metaserver and all chunkservers without cleaning logs or chunk dirs, then `qfscat` readback matched the original sha256. +- Interrupted writer restart exposed a correctness gap: killing a large `qfsput` left `/recovery/killed_stream` with a namespace size beyond a chunk that was lazy-created and written but not made stable. After restart, chunkservers deleted that dirty chunk as stale, while metaserver still had the chunk mapping; reading failed at offset 536870912 with `no replicas available chunk: 131084`. 
+- A simple attempt to create lazy chunks as initially stable was rejected by the existing write path (`WRITE_ID_ALLOC` returned `chunk stable`), so this needs a real recovery design rather than a shortcut. + +Pending: + +- Add proper client-crash recovery semantics for HDFS-like lazy-created chunks. Candidate fixes: lease recovery that makes the last dirty chunk stable, or metaserver-side truncation/mapping cleanup for chunks that never become stable. +- Full 100k-file benchmark if needed; the short and medium probes already verify the 1s allocate stall is fixed. diff --git a/docs/rfc/RFC-0003-write-path-optimization-summary.md b/docs/rfc/RFC-0003-write-path-optimization-summary.md new file mode 100644 index 000000000..ac7232476 --- /dev/null +++ b/docs/rfc/RFC-0003-write-path-optimization-summary.md @@ -0,0 +1,217 @@ +# RFC-0003: QFS Write Path Optimization Summary + +## Summary + +This document summarizes the write-path optimization work on branch +`lock-opt` and the proposed plan for preparing upstream pull requests. + +The overall direction is to reduce small-file create/write latency by: + +- reducing metaserver namespace lock contention, +- removing synchronous chunkserver pre-create from the hot allocation path, +- reusing chunkserver connections, +- replacing chain replication in the client write path with client-side fanout, +- avoiding duplicate checksum scans on every chunkserver replica, +- reducing avoidable buffer copies in fanout. + +The current benchmark focus is replicated 1 MB file creation with three +chunkservers. + +## Implemented Optimizations + +### Namespace / Metaserver + +- Added the NamespaceV2 implementation and tests. +- Reworked metadata operations toward finer-grained locking instead of a single + coarse global namespace lock. +- Added write transaction / WAL work needed by the v2 namespace path. +- Added batch apply / commit optimizations for high-frequency small + transactions. 
+- Added recovery validation tests for NamespaceV2 WAL replay. + +### HDFS-like Write Allocation + +- Added an optional HDFS-like allocation path: + - `metaServer.writeFlow.hdfsLikeAllocate` + - `chunkServer.writeFlow.lazyCreateOnWrite` +- The metaserver can allocate chunk metadata and return the lease to the client + without synchronously sending `ALLOCATE_CHUNK` to chunkservers. +- The chunkserver lazily creates the chunk on `WRITE_ID_ALLOC` when enabled. +- The client passes file id and lease id to `WRITE_ID_ALLOC`. +- Crash/restart validation found the expected incomplete-write gap and added a + recovery direction: truncate EOF to the last recoverable chunk instead of + trusting a namespace size that points past available stable replicas. + +### Client Chunkserver Connection Reuse + +- Added a chunkserver client pool for the write path. +- `mstress_client` now honors `QFS_CLIENT_CONFIG`, so benchmark runs can use the + same client config as normal tools. +- This removed connection churn from short-file write tests: + - `ChunkServer.Pool.Connect=3` + - `ChunkServer.Pool.OpsQueued=9000` for 1000 files with three fanout RPC + stages. + +### Parallel Replica Write Fanout + +- Added `client.parallelReplicaWrite` and enabled it by default in the test + configuration. +- Added `No-forward` / `NF` support for: + - `WRITE_ID_ALLOC` + - `WRITE_PREPARE` + - `WRITE_SYNC` + - `CLOSE` +- Client now sends write-id allocation, write prepare, and close RPCs directly + to all replicas instead of relying on chunkserver-to-chunkserver forwarding. +- The request still carries the full replica list so each chunkserver can derive + its own replica position and write id. + +### Fanout Buffer Sharing + +- Verified that `IOBufferData` already uses a ref-counted data block. +- Added `IOBuffer::AppendShared()` so fanout requests attach shared buffer + references directly. 
+- `Writer` now uses `AppendShared()` instead of creating a temporary cloned + `IOBuffer` for each replica fanout request. +- Payload bytes are not copied for fanout; each replica request holds a shared + reference and the data is released when the last reference drops. + +### Checksum Hot Path + +- Client now sends the block checksum vector (one checksum per 64 KB block) in `WRITE_PREPARE` reply mode. +- Chunkserver can reuse the client-provided checksum vector for chunk metadata. +- With `chunkServer.skipWritePrepareChecksumVerify=1`, chunkserver skips the + duplicate payload scan in the write hot path and trusts the client-provided + checksum vector. +- Short RPC checksum vector output was fixed to preserve hex formatting for + subsequent short-format fields. + +This matches the HDFS-style tradeoff more closely: clients provide packet / +chunk checksums, datanodes store them, and later reads or scrubs verify stored +data against those checksums. + +## Latest Benchmark Snapshot + +Environment: + +- three local chunkservers, +- client and chunkservers use `202.168.115.34` instead of `localhost` to force + traffic through the network path, +- three replicas, +- 1 MB files, +- `client.parallelReplicaWrite=1`, +- `chunkServer.skipWritePrepareChecksumVerify=1`. + +### Single Client + +Plan: 1 client, 1000 files, 1 MB per file. + +```text +1000 files created in 3058 ms + +open avg: 240 us +write avg: 134 us +close avg: 2681 us + +Write.ChunkWriteUsec: 1735099 us +Write.CloseUsec: 2655539 us +Write.WriteIdAlloc: 225061 us +Write.ChunkClose: 114009 us + +ChunkServer.Pool.BytesSent: 3147979791 +ChunkServer.Pool.Connect: 3 +ChunkServer.Pool.OpsQueued: 9000 +``` + +Approximate throughput: + +- logical write throughput: about 327 MB/s, +- actual client network send: about 1.03 GB/s because every 1 MB file is sent + to three replicas. + +### Two Clients + +Plan: 2 clients, 1000 files per client, 1 MB per file.
+ +Before checksum-vector / skip-verify optimization: + +```text +proc_00: 4353 ms, Write.ChunkWriteUsec=2865775 +proc_01: 4349 ms, Write.ChunkWriteUsec=2863192 +``` + +After checksum-vector / skip-verify optimization: + +```text +proc_00: 4171 ms, Write.ChunkWriteUsec=2414194 +proc_01: 4209 ms, Write.ChunkWriteUsec=2435554 +``` + +`ChunkWriteUsec` dropped by about 14-16%. Total time dropped by about 3-4%. +The remaining cost is dominated by three-replica network fanout and chunk file +write / close work. + +## Correctness Notes + +- `chunkServer.skipWritePrepareChecksumVerify=1` changes write-time checksum + semantics: the chunkserver trusts the client-provided checksum vector instead + of recomputing checksums over the received payload. This is closer to the + HDFS write-path tradeoff, but it should be treated as a deliberate + configuration choice. +- The HDFS-like lazy-create path needs careful recovery semantics for killed + clients. The current direction is to truncate or repair namespace EOF to the + last recoverable stable chunk after restart. +- Append, striped files, object store files, and authenticated / tokenized + synchronous replication paths need separate review before enabling the new + write flow broadly. + +## Upstream PR Plan + +The current branch contains several related but separable changes. For upstream +review, split into smaller PRs: + +1. **Infrastructure / tests** + - NamespaceV2 tests and WAL replay tests. + - Benchmark client config loading through `QFS_CLIENT_CONFIG`. + - Minimal scripts or docs only if acceptable upstream. + +2. **NamespaceV2 / lock optimization** + - Finer-grained metadata locking. + - WAL / transaction correctness tests. + - Keep performance changes separate from protocol changes where possible. + +3. **HDFS-like lazy create** + - Config switches. + - Metaserver allocate bypass. + - Chunkserver lazy chunk creation on `WRITE_ID_ALLOC`. 
+ - Recovery behavior must be completed before this is proposed as + production-ready. + +4. **Write connection reuse and fanout** + - Client chunkserver pool. + - `client.parallelReplicaWrite`. + - `No-forward` protocol support. + - Parallel `WRITE_ID_ALLOC`, `WRITE_PREPARE`, and `CLOSE`. + +5. **Checksum-vector hot-path optimization** + - Client sends block checksum vector in write-prepare reply mode. + - Chunkserver reuses the vector. + - Optional `chunkServer.skipWritePrepareChecksumVerify`. + +6. **Buffer sharing cleanup** + - `IOBuffer::AppendShared()`. + - Writer fanout uses shared buffer references instead of temporary clone + buffers. + +## Remaining Work + +- Add chunkserver-side detailed timing counters for: + - request parse, + - checksum handling, + - disk queue submit, + - disk completion latency. +- Finish crash/restart recovery for killed writers under lazy create. +- Re-run larger 100k-file tests after recovery semantics are finalized. +- Run compatibility tests with short RPC disabled and enabled. +- Run tests with `chunkServer.skipWritePrepareChecksumVerify=0` and `1` to make + the correctness/performance tradeoff explicit. diff --git a/press.sh b/press.sh new file mode 100755 index 000000000..32b2064ae --- /dev/null +++ b/press.sh @@ -0,0 +1,11 @@ + cd /work/bigo-qfs + + python bld/benchmarks/mstress/mstress.py \ + -m slave \ + -f qfs \ + -s localhost \ + -p 20000 \ + -t create \ + -a output/mstress_100k_1m_file.plan \ + -c localhost \ + -k localhost diff --git a/src/cc/chunk/ChunkManager.cc b/src/cc/chunk/ChunkManager.cc index 9348ab8fe..146759f13 100644 --- a/src/cc/chunk/ChunkManager.cc +++ b/src/cc/chunk/ChunkManager.cc @@ -102,7 +102,7 @@ struct ChunkManager::ChunkDirInfo : public ITimeout ChunkDirInfo() : ITimeout(), dirname(), - bufferedIoFlag(false), + bufferedIoFlag(true), storageTier(kKfsSTierUndef), usedSpace(0), availableSpace(-1), @@ -1792,7 +1792,7 @@ ChunkInfoHandle::Release(ChunkInfoHandle::ChunkLists* chunkInfoLists) if (! 
IsStable()) { UpdateDirStableCount(); } - KFS_LOG_STREAM_INFO << + KFS_LOG_STREAM_DEBUG << "closing chunk " << chunkInfo.chunkId << " version: " << chunkInfo.chunkVersion << " file handle: " << logFH << @@ -2126,7 +2126,7 @@ ChunkManager::ChunkManager() mMinPendingIoThreshold(8 << 20), mPlacementMaxWaitingAvgUsecsThreshold(5 * 60 * 1000 * 1000), mAllowSparseChunksFlag(true), - mBufferedIoFlag(false), + mBufferedIoFlag(true), mSyncChunkHeaderFlag(false), mCheckDirWritableFlag(true), mCheckDirTestWriteSize(16 << 10), @@ -2156,6 +2156,8 @@ ChunkManager::ChunkManager() mDiskBufferManagerEnabledFlag(true), mForceVerifyDiskReadChecksumFlag(false), mWritePrepareReplyFlag(true), + mSkipWritePrepareChecksumVerifyFlag(false), + mLazyCreateOnWriteFlag(false), mCryptoKeys(globalNetManager(), 0), mFileSystemId(-1), mFileSystemIdSuffix(), @@ -2577,6 +2579,12 @@ ChunkManager::SetParameters(const Properties& prop) mWritePrepareReplyFlag = prop.getValue( "chunkServer.debugTestWriteSync", mWritePrepareReplyFlag ? 0 : 1) == 0; + mSkipWritePrepareChecksumVerifyFlag = prop.getValue( + "chunkServer.skipWritePrepareChecksumVerify", + mSkipWritePrepareChecksumVerifyFlag ? 1 : 0) != 0; + mLazyCreateOnWriteFlag = prop.getValue( + "chunkServer.writeFlow.lazyCreateOnWrite", + mLazyCreateOnWriteFlag ? 1 : 0) != 0; mFsIdFileNamePrefix = prop.getValue( "chunkServer.fsIdFileNamePrefix", mFsIdFileNamePrefix); mDirCheckerIoTimeoutSec = prop.getValue( @@ -2806,7 +2814,8 @@ ChunkManager::SetBufferedIo(const Properties& props) break; } } - const bool bufferedIoFlag = pit != prefixes.end(); + const bool bufferedIoFlag = + mBufferedIoFlag || (pit != prefixes.end()); if (bufferedIoFlag != it->bufferedIoFlag) { it->bufferedIoFlag = bufferedIoFlag; if (it->availableSpace < 0 && ! 
it->dirLock) { @@ -3176,7 +3185,7 @@ ChunkManager::AllocChunk( cih->Delete(mChunkInfoLists); return -EFAULT; } - KFS_LOG_STREAM_INFO << "creating chunk: " << MakeChunkPathname(cih) << + KFS_LOG_STREAM_DEBUG << "creating chunk: " << MakeChunkPathname(cih) << KFS_LOG_EOM; int ret = OpenChunk(cih, O_RDWR | O_CREAT); if (ret < 0) { @@ -3320,8 +3329,17 @@ ChunkManager::MakeChunkStable(kfsChunkId_t chunkId, kfsSeq_t chunkVersion, return -EINVAL; } } else if (chunkVersion != cih->chunkInfo.chunkVersion) { - statusMsg = "version mismatch"; - return -EINVAL; + if (! (mLazyCreateOnWriteFlag && ! appendFlag && ! cih->IsStable() && + cih->chunkInfo.chunkVersion == 0 && chunkVersion > 0)) { + statusMsg = "version mismatch"; + return -EINVAL; + } + KFS_LOG_STREAM_INFO << + "make stable lazy dirty chunk version remap:" + " chunk: " << chunkId << + " local: " << cih->chunkInfo.chunkVersion << + " target: " << chunkVersion << + KFS_LOG_EOM; } if (cih->IsBeingReplicated()) { statusMsg = "chunk replication is in progress"; @@ -3598,9 +3616,21 @@ ChunkManager::ReadChunkMetadataDone(ReadChunkMetaOp* op, IOBuffer* dataBuf) " " << op->Show() << KFS_LOG_EOM; } else { + const int64_t lazyDirtyRecoveredSize = + (mLazyCreateOnWriteFlag && ! cih->IsStable() && + cih->chunkInfo.chunkVersion == 0 && + dci.chunkSize == 0 && cih->chunkInfo.chunkSize > 0) ? 
+ cih->chunkInfo.chunkSize : int64_t(-1); cih->chunkInfo.SetChecksums(dci); cih->chunkInfo.chunkFlags = dci.flags; - if (cih->chunkInfo.chunkSize > (int64_t)dci.chunkSize) { + if (0 <= lazyDirtyRecoveredSize) { + KFS_LOG_STREAM_INFO << + "using dirty lazy-created chunk file size:" + " chunk: " << cih->chunkInfo.chunkId << + " size: " << lazyDirtyRecoveredSize << + KFS_LOG_EOM; + cih->chunkInfo.chunkSize = lazyDirtyRecoveredSize; + } else if (cih->chunkInfo.chunkSize > (int64_t)dci.chunkSize) { const int64_t extra = cih->chunkInfo.chunkSize - dci.chunkSize; if (0 <= cih->chunkInfo.chunkVersion) { @@ -4442,7 +4472,7 @@ ChunkManager::OpenChunk(ChunkInfoHandle* cih, int openFlags) cih->UpdateDirStableCount(); } KFS_LOG_STREAM(openFlag ? - MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelINFO) << + MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelDEBUG) << (openFlag ? "open" : "create") << " chunk file: " << fn << " file handle: " << reinterpret_cast(cih->dataFH.get()) << @@ -4540,7 +4570,7 @@ ChunkManager::CloseChunk(ChunkInfoHandle* cih, KfsOp* op /* = 0 */) ! 
cih->SyncMeta()) { Release(*cih); } else { - KFS_LOG_STREAM_INFO << + KFS_LOG_STREAM_DEBUG << "chunk: " << cih->chunkInfo.chunkId << " version: " << cih->chunkInfo.chunkVersion << " not released on close; might give up lease" << @@ -5659,13 +5689,37 @@ ChunkManager::RemoveDirtyChunks() fileSystemId, ioTimeSec, readFlag)) { - const bool kStableFlag = false; - const bool kForceDeleteFlag = true; - ScheduleCleanup( - *it, fileId, chunkId, chunkVers, - (int64_t)buf.st_size - (int64_t)KFS_CHUNK_HEADER_SIZE, - kStableFlag, kForceDeleteFlag); - InsertLastInFlight(chunkId); + const int64_t dataSize = + (int64_t)buf.st_size - (int64_t)KFS_CHUNK_HEADER_SIZE; + if (mLazyCreateOnWriteFlag) { + const bool kStableFlag = false; + ChunkInfoHandle* const cih = + new ChunkInfoHandle(*it, kStableFlag); + cih->chunkInfo.fileId = fileId; + cih->chunkInfo.chunkId = chunkId; + cih->chunkInfo.chunkVersion = chunkVers; + cih->chunkInfo.chunkSize = max(int64_t(0), dataSize); + if (AddMapping(cih) == cih) { + KFS_LOG_STREAM_INFO << + "preserving dirty lazy-created chunk:" + " file: " << fileId << + " chunk: " << chunkId << + " version: " << chunkVers << + " size: " << cih->chunkInfo.chunkSize << + KFS_LOG_EOM; + } else { + const bool kForceDeleteFlag = true; + const bool kEvacuatedFlag = false; + MakeStale(*cih, kForceDeleteFlag, kEvacuatedFlag); + } + } else { + const bool kStableFlag = false; + const bool kForceDeleteFlag = true; + ScheduleCleanup( + *it, fileId, chunkId, chunkVers, dataSize, + kStableFlag, kForceDeleteFlag); + InsertLastInFlight(chunkId); + } } else { KFS_LOG_STREAM_INFO << "cleaning out dirty chunk: " << name << @@ -6340,9 +6394,37 @@ ChunkManager::AllocateWriteId( const ServerLocation& peerLoc) { const bool kAddObjectBlockMappingFlag = false; - ChunkInfoHandle* const cih = GetChunkInfoHandle( + ChunkInfoHandle* cih = GetChunkInfoHandle( wi->chunkId, wi->chunkVersion, kAddObjectBlockMappingFlag); - if (! cih) { + if (! cih && mLazyCreateOnWriteFlag && ! 
wi->isForRecordAppend && + 0 <= wi->fileId && 0 <= wi->chunkVersion) { + const bool kIsBeingReplicatedFlag = false; + const bool kMustExistFlag = false; + const int ret = AllocChunk( + wi->fileId, + wi->chunkId, + wi->chunkVersion, + kKfsSTierUndef, + kKfsSTierUndef, + kIsBeingReplicatedFlag, + &cih, + kMustExistFlag + ); + if (ret < 0) { + wi->statusMsg = "lazy chunk create failed"; + wi->status = ret; + } else { + wi->lazyChunkCreatedFlag = true; + KFS_LOG_STREAM_DEBUG << + "lazy chunk create:" + " file: " << wi->fileId << + " chunk: " << wi->chunkId << + " version: " << wi->chunkVersion << + KFS_LOG_EOM; + } + } + if (0 != wi->status) { + } else if (! cih) { wi->statusMsg = "no such chunk"; wi->status = -EBADF; } else if (wi->chunkVersion != cih->chunkInfo.chunkVersion) { diff --git a/src/cc/chunk/ChunkManager.h b/src/cc/chunk/ChunkManager.h index 7bfdaaebd..f55faf3c4 100644 --- a/src/cc/chunk/ChunkManager.h +++ b/src/cc/chunk/ChunkManager.h @@ -120,6 +120,10 @@ class ChunkManager : private ITimeout { }; bool SetParameters(const Properties& prop); + bool IsLazyCreateOnWriteEnabled() const + { return mLazyCreateOnWriteFlag; } + bool IsWritePrepareChecksumVerifySkipped() const + { return mSkipWritePrepareChecksumVerifyFlag; } /// Init function to configure the chunk manager object. 
bool Init(const vector& chunkDirs, const Properties& prop); @@ -962,6 +966,8 @@ class ChunkManager : private ITimeout { bool mDiskBufferManagerEnabledFlag; bool mForceVerifyDiskReadChecksumFlag; bool mWritePrepareReplyFlag; + bool mSkipWritePrepareChecksumVerifyFlag; + bool mLazyCreateOnWriteFlag; CryptoKeys mCryptoKeys; int64_t mFileSystemId; string mFileSystemIdSuffix; diff --git a/src/cc/chunk/ClientSM.cc b/src/cc/chunk/ClientSM.cc index 6f9e0bc9c..c8469fea0 100644 --- a/src/cc/chunk/ClientSM.cc +++ b/src/cc/chunk/ClientSM.cc @@ -753,7 +753,8 @@ ClientSM::GetWriteOp(KfsOp& op, int align, int numBytes, } if (nAvail < numBytes) { mNetConnection->SetMaxReadAhead(numBytes - nAvail); - SetReceiveContent(numBytes, op.op == CMD_WRITE_PREPARE); + SetReceiveContent(numBytes, op.op == CMD_WRITE_PREPARE && + ! gChunkManager.IsWritePrepareChecksumVerifySkipped()); // we couldn't process the command...so, wait return false; } @@ -944,7 +945,9 @@ ClientSM::HandleClientCmd(IOBuffer& iobuf, int inCmdLen) bufferBytes = 0 <= op->status ? IoRequestBytes(wop->numBytes) : 0; if (GetReceiveByteCount() == (int)wop->numBytes) { wop->receivedChecksum = GetChecksum(); - wop->blocksChecksums.swap(GetBlockChecksums()); + if (! gChunkManager.IsWritePrepareChecksumVerifySkipped()) { + wop->blocksChecksums.swap(GetBlockChecksums()); + } } ReceiveClear(); } else if (op->op == CMD_RECORD_APPEND) { diff --git a/src/cc/chunk/KfsOps.cc b/src/cc/chunk/KfsOps.cc index 78079707a..f9ebcc275 100644 --- a/src/cc/chunk/KfsOps.cc +++ b/src/cc/chunk/KfsOps.cc @@ -862,6 +862,33 @@ KfsOp::GetClientSM() return (clientSMFlag ? 
static_cast(clnt) : 0); } +bool +WritePrepareOp::Validate() +{ + if (checksumsCnt <= 0) { + checksumsVal.clear(); + return ChunkAccessRequestOp::Validate(); + } + const char* ptr = checksumsVal.mPtr; + const char* const end = ptr + checksumsVal.mLen; + blocksChecksums.clear(); + blocksChecksums.reserve(checksumsCnt); + for (int i = 0; i < checksumsCnt; i++) { + uint32_t cksum = 0; + if (! (initialShortRpcFormatFlag ? + ValueParserT::ParseInt(ptr, end - ptr, cksum) : + ValueParserT::ParseInt(ptr, end - ptr, cksum))) { + return false; + } + blocksChecksums.push_back(cksum); + while (ptr < end && (*ptr & 0xFF) > ' ') { + ++ptr; + } + } + checksumsVal.clear(); + return ChunkAccessRequestOp::Validate(); +} + bool WriteSyncOp::Validate() { @@ -1187,7 +1214,7 @@ WriteOp::HandleWriteDone(int code, void* data) void CloseOp::Execute() { - KFS_LOG_STREAM_INFO << + KFS_LOG_STREAM_DEBUG << "closing" " chunk: " << chunkId << " version: " << chunkVersion << @@ -1199,6 +1226,7 @@ CloseOp::Execute() int64_t writeId = -1; bool needToForward = needToForwardToPeer(shortRpcFormatFlag, servers, numServers, myPos, peerLoc, hasWriteId, writeId); + needToForward = ! noForwardFlag && needToForward; if (chunkVersion < 0 && needToForward && hasWriteId) { status = -EINVAL; statusMsg = "invalid object store file block close"; @@ -1253,8 +1281,12 @@ CloseOp::Execute() waitReadableFlag ? &readMetaFlag : 0 ); if (ret < 0) { - status = ret; - statusMsg = "invalid write or chunk id"; + if (! needAck && ret == -EBADF) { + status = 0; + } else { + status = ret; + statusMsg = "invalid write or chunk id"; + } } if (waitReadableFlag && 0 <= ret) { return; @@ -1329,6 +1361,7 @@ CloseOp::HandleDone(int code, void* data) void AllocChunkOp::Execute() { + debugStartUsec = microseconds(); int myPos = -1; int64_t writeId = -1; ServerLocation peerLoc; @@ -1427,6 +1460,7 @@ AllocChunkOp::HandleChunkAllocDone(int code, void* data) } if (! 
diskIo) { SET_HANDLER(this, &AllocChunkOp::HandleChunkAllocDone); + debugBeforeAllocUsec = microseconds(); if (appendFlag) { int myPos = -1; int64_t writeId = -1; @@ -1449,9 +1483,11 @@ AllocChunkOp::HandleChunkAllocDone(int code, void* data) this ); } + debugAfterAllocUsec = microseconds(); if (diskIo) { // File create is in progress. This method will be called again // when create / open completes. + debugDiskWaitStartUsec = microseconds(); assert(status == 0); return 0; } @@ -1460,6 +1496,32 @@ AllocChunkOp::HandleChunkAllocDone(int code, void* data) gLeaseClerk.RegisterLease(*this); } } + const int64_t nowUsec = microseconds(); + const int64_t totalUsec = debugStartUsec > 0 ? + nowUsec - debugStartUsec : 0; + if (100000 <= totalUsec) { + KFS_LOG_STREAM_INFO << + "alloc-chunk timing:" + " seq: " << seq << + " file: " << fileId << + " chunk: " << chunkId << + " version: " << chunkVersion << + " status: " << status << + " total-usec: " << totalUsec << + " pre-alloc-usec: " << + (debugBeforeAllocUsec > debugStartUsec ? + debugBeforeAllocUsec - debugStartUsec : 0) << + " alloc-call-usec: " << + (debugAfterAllocUsec > debugBeforeAllocUsec ? + debugAfterAllocUsec - debugBeforeAllocUsec : 0) << + " disk-wait-usec: " << + (debugDiskWaitStartUsec > 0 ? + nowUsec - debugDiskWaitStartUsec : 0) << + " post-alloc-usec: " << + (debugAfterAllocUsec > 0 ? + nowUsec - debugAfterAllocUsec : 0) << + KFS_LOG_EOM; + } diskIo.reset(); Submit(); return 0; @@ -2331,8 +2393,9 @@ WriteIdAllocOp::Execute() int64_t dummyWriteId = -1; int myPos = -1; ServerLocation peerLoc; - const bool needToForward = needToForwardToPeer(shortRpcFormatFlag, + bool needToForward = needToForwardToPeer(shortRpcFormatFlag, servers, numServers, myPos, peerLoc, false, dummyWriteId); + needToForward = ! 
noForwardFlag && needToForward; if (myPos < 0) { statusMsg = "invalid or missing Servers: field"; status = -EINVAL; @@ -2342,6 +2405,19 @@ WriteIdAllocOp::Execute() const bool writeMaster = myPos == 0; bool allowCSClearTextFlag = chunkAccessTokenValidFlag && (chunkAccessFlags & ChunkAccessToken::kAllowClearTextFlag) != 0; + if (writeMaster && gChunkManager.IsLazyCreateOnWriteEnabled() && + 0 <= leaseId && 0 <= fileId && ! gLeaseClerk.IsLeaseValid( + chunkId, chunkVersion, 0, 0)) { + AllocChunkOp leaseOp; + leaseOp.fileId = fileId; + leaseOp.chunkId = chunkId; + leaseOp.chunkVersion = chunkVersion; + leaseOp.leaseId = leaseId; + leaseOp.appendFlag = false; + leaseOp.allowCSClearTextFlag = allowCSClearTextFlag; + leaseOp.shortRpcFormatFlag = shortRpcFormatFlag; + gLeaseClerk.RegisterLease(leaseOp); + } if (writeMaster && ! gLeaseClerk.IsLeaseValid( chunkId, chunkVersion, &syncReplicationAccess, &allowCSClearTextFlag)) { @@ -2375,6 +2451,8 @@ WriteIdAllocOp::Execute() } if (needToForward) { ForwardToPeer(peerLoc, writeMaster, allowCSClearTextFlag); + } else if (lazyChunkCreatedFlag) { + WriteLazyCreatedChunkMetadata(); } else { ReadChunkMetadata(); } @@ -2427,10 +2505,29 @@ WriteIdAllocOp::HandlePeerReply(int code, void* data) initialShortRpcFormatFlag, peerShortRpcFormatFlag); writePrepareReplyFlag = writePrepareReplyFlag && fwdedOp->writePrepareReplyFlag; + if (lazyChunkCreatedFlag) { + return WriteLazyCreatedChunkMetadata(); + } ReadChunkMetadata(); return 0; } +int +WriteIdAllocOp::WriteLazyCreatedChunkMetadata() +{ + assert(status == 0); + SET_HANDLER(this, &WriteIdAllocOp::Done); + const int ret = gChunkManager.WriteChunkMetadata( + chunkId, chunkVersion, this); + if (0 <= ret) { + return 0; + } + if (0 <= status) { + status = ret; + } + return Done(EVENT_CMD_DONE, this); +} + void WriteIdAllocOp::ReadChunkMetadata() { @@ -2474,7 +2571,7 @@ WriteIdAllocOp::Done(int code, void* data) } } KFS_LOG_STREAM( - status == 0 ? 
MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelERROR) << + status == 0 ? MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << (status == 0 ? "done: " : "failed: ") << Show() << KFS_LOG_EOM; Submit(); @@ -2489,8 +2586,9 @@ WritePrepareOp::Execute() // check if we need to forward anywhere ServerLocation peerLoc; int myPos = -1; - const bool needToForward = needToForwardToPeer(shortRpcFormatFlag, + bool needToForward = needToForwardToPeer(shortRpcFormatFlag, servers, numServers, myPos, peerLoc, true, writeId); + needToForward = ! noForwardFlag && needToForward; if (myPos < 0) { statusMsg = "invalid or missing Servers: field"; status = -EINVAL; @@ -2542,9 +2640,22 @@ WritePrepareOp::Execute() } if (blocksChecksums.empty()) { - blocksChecksums = ComputeChecksums(&dataBuf, numBytes, &receivedChecksum); + blocksChecksums = ComputeChecksums( + &dataBuf, numBytes, &receivedChecksum); + } else if (! gChunkManager.IsWritePrepareChecksumVerifySkipped()) { + receivedChecksum = ComputeBlockChecksum(&dataBuf, numBytes); + } + if (gChunkManager.IsWritePrepareChecksumVerifySkipped() && + (offset % CHECKSUM_BLOCKSIZE != 0 || + numBytes % CHECKSUM_BLOCKSIZE != 0 || + blocksChecksums.size() != numBytes / CHECKSUM_BLOCKSIZE)) { + statusMsg = "invalid write checksum vector"; + status = -EINVAL; + Done(EVENT_CMD_DONE, this); + return; } - if (receivedChecksum != checksum) { + if (! gChunkManager.IsWritePrepareChecksumVerifySkipped() && + receivedChecksum != checksum) { statusMsg = "checksum mismatch"; KFS_LOG_STREAM_ERROR << "checksum mismatch: sent: " << checksum << @@ -2649,7 +2760,7 @@ WritePrepareOp::Done(int code, void* data) return 0; } KFS_LOG_STREAM( - status >= 0 ? MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelERROR) << + status >= 0 ? MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << (status >= 0 ? "done: " : "failed: ") << Show() << " status: " << status << (statusMsg.empty() ? 
"" : " msg: ") << statusMsg << @@ -2669,8 +2780,9 @@ WriteSyncOp::Execute() ServerLocation peerLoc; int myPos = -1; // check if we need to forward anywhere - const bool needToForward = needToForwardToPeer(shortRpcFormatFlag, + bool needToForward = needToForwardToPeer(shortRpcFormatFlag, servers, numServers, myPos, peerLoc, true, writeId); + needToForward = ! noForwardFlag && needToForward; if (myPos < 0) { statusMsg = "invalid or missing Servers: field"; status = -EINVAL; @@ -2890,7 +3002,7 @@ WriteSyncOp::Done(int code, void* data) return 0; } KFS_LOG_STREAM( - status >= 0 ? MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelERROR) << + status >= 0 ? MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) << (status >= 0 ? "done: " : "failed: ") << Show() << " status: " << status << (statusMsg.empty() ? "" : " msg: ") << statusMsg << @@ -3598,7 +3710,16 @@ WriteIdAllocOp::Request(ReqOstream& os) os << "Version: " << KFS_VERSION_STR << "\r\n"; } os << - (shortRpcFormatFlag ? "H:" : "Chunk-handle: ") << chunkId << "\r\n" << + (shortRpcFormatFlag ? "H:" : "Chunk-handle: ") << chunkId << "\r\n"; + if (fileId >= 0) { + os << (shortRpcFormatFlag ? "P:" : "File-handle: ") << + fileId << "\r\n"; + } + if (leaseId >= 0) { + os << (shortRpcFormatFlag ? "L:" : "Lease-id: ") << + leaseId << "\r\n"; + } + os << (shortRpcFormatFlag ? "V:" : "Chunk-version: ") << chunkVersion << "\r\n" << (shortRpcFormatFlag ? "O:" : "Offset: ") << offset << "\r\n" << (shortRpcFormatFlag ? 
"B:" : "Num-bytes: ") << numBytes << "\r\n" << diff --git a/src/cc/chunk/KfsOps.h b/src/cc/chunk/KfsOps.h index 43e2d30a9..de21b5020 100644 --- a/src/cc/chunk/KfsOps.h +++ b/src/cc/chunk/KfsOps.h @@ -510,6 +510,10 @@ struct AllocChunkOp : public KfsOp { int chunkAccessLength; SyncReplicationAccess syncReplicationAccess; DiskIoPtr diskIo; + int64_t debugStartUsec; + int64_t debugBeforeAllocUsec; + int64_t debugAfterAllocUsec; + int64_t debugDiskWaitStartUsec; AllocChunkOp() : KfsOp(CMD_ALLOC_CHUNK), @@ -530,7 +534,11 @@ struct AllocChunkOp : public KfsOp { contentLength(0), chunkAccessLength(0), syncReplicationAccess(), - diskIo() + diskIo(), + debugStartUsec(0), + debugBeforeAllocUsec(0), + debugAfterAllocUsec(0), + debugDiskWaitStartUsec(0) {} void Execute(); // handlers for reading/writing out the chunk meta-data @@ -1012,6 +1020,7 @@ struct RetireOp : public KfsOp { struct CloseOp : public KfsClientChunkOp { uint32_t numServers; // input bool needAck; // input: when set, this RPC is ack'ed + bool noForwardFlag; // input: do not forward to peer bool hasWriteId; // input int64_t masterCommitted; // input StringBufT<256> servers; // input: set of servers on which to chunk is to be closed @@ -1024,6 +1033,7 @@ struct CloseOp : public KfsClientChunkOp { : KfsClientChunkOp(CMD_CLOSE), numServers (0u), needAck (true), + noForwardFlag (false), hasWriteId (false), masterCommitted ((int64_t)-1), servers (), @@ -1036,6 +1046,7 @@ struct CloseOp : public KfsClientChunkOp { : KfsClientChunkOp(CMD_CLOSE), numServers (op.numServers), needAck (op.needAck), + noForwardFlag (op.noForwardFlag), hasWriteId (op.hasWriteId), masterCommitted (op.masterCommitted), servers (op.servers), @@ -1089,6 +1100,7 @@ struct CloseOp : public KfsClientChunkOp { .Def2("Num-servers", "R", &CloseOp::numServers) .Def2("Servers", "S", &CloseOp::servers) .Def2("Need-ack", "A", &CloseOp::needAck, true) + .Def2("No-forward", "NF", &CloseOp::noForwardFlag, false) .Def2("Has-write-id", "W", 
&CloseOp::hasWriteId, false) .Def2("Master-committed", "M", &CloseOp::masterCommitted, int64_t(-1)) .Def2("C-access-length", "AL", &CloseOp::chunkAccessLength) @@ -1238,6 +1250,8 @@ struct GetRecordAppendOpStatus : public KfsClientChunkOp }; struct WriteIdAllocOp : public ChunkAccessRequestOp { + kfsFileId_t fileId; + int64_t leaseId; kfsSeq_t clientSeq; /* input */ int64_t offset; /* input */ size_t numBytes; /* input */ @@ -1246,15 +1260,19 @@ struct WriteIdAllocOp : public ChunkAccessRequestOp { StringBufT<256> servers; /* input: set of servers on which to write */ WriteIdAllocOp* fwdedOp; /* if we did any fwd'ing, this is the op that tracks it */ bool isForRecordAppend; /* set if the write-id-alloc is for a record append that will follow */ - bool writePrepareReplyFlag; /* write prepare reply supported */ + bool writePrepareReplyFlag; + bool noForwardFlag; bool peerShortRpcFormatFlag; int contentLength; int chunkAccessLength; SyncReplicationAccess syncReplicationAccess; RemoteSyncSMPtr appendPeer; + bool lazyChunkCreatedFlag; WriteIdAllocOp() : ChunkAccessRequestOp(CMD_WRITE_ID_ALLOC), + fileId(-1), + leaseId(-1), clientSeq(-1), offset(0), numBytes(0), @@ -1264,14 +1282,18 @@ struct WriteIdAllocOp : public ChunkAccessRequestOp { fwdedOp(0), isForRecordAppend(false), writePrepareReplyFlag(true), + noForwardFlag(false), peerShortRpcFormatFlag(false), contentLength(0), chunkAccessLength(0), syncReplicationAccess(), - appendPeer() + appendPeer(), + lazyChunkCreatedFlag(false) { SET_HANDLER(this, &WriteIdAllocOp::Done); } WriteIdAllocOp(const WriteIdAllocOp& other) : ChunkAccessRequestOp(CMD_WRITE_ID_ALLOC), + fileId(other.fileId), + leaseId(other.leaseId), clientSeq(other.clientSeq), offset(other.offset), numBytes(other.numBytes), @@ -1280,11 +1302,13 @@ struct WriteIdAllocOp : public ChunkAccessRequestOp { fwdedOp(0), isForRecordAppend(other.isForRecordAppend), writePrepareReplyFlag(other.writePrepareReplyFlag), + noForwardFlag(other.noForwardFlag), 
peerShortRpcFormatFlag(false), contentLength(other.contentLength), chunkAccessLength(other.chunkAccessLength), syncReplicationAccess(other.syncReplicationAccess), - appendPeer() + appendPeer(), + lazyChunkCreatedFlag(false) { chunkId = other.chunkId; chunkVersion = other.chunkVersion; @@ -1306,6 +1330,7 @@ struct WriteIdAllocOp : public ChunkAccessRequestOp { // write-id alloc op as a hint to page the data back in---writes // are coming. void ReadChunkMetadata(); + int WriteLazyCreatedChunkMetadata(); void ForwardToPeer( const ServerLocation& loc, @@ -1349,6 +1374,8 @@ struct WriteIdAllocOp : public ChunkAccessRequestOp { template static T& ParserDef(T& parser) { return ChunkAccessRequestOp::ParserDef(parser) + .Def2("File-handle", "P", &WriteIdAllocOp::fileId, kfsFileId_t(-1)) + .Def2("Lease-id", "L", &WriteIdAllocOp::leaseId, int64_t(-1)) .Def2("Offset", "O", &WriteIdAllocOp::offset) .Def2("Num-bytes", "B", &WriteIdAllocOp::numBytes) .Def2("Num-servers", "R", &WriteIdAllocOp::numServers) @@ -1356,6 +1383,7 @@ struct WriteIdAllocOp : public ChunkAccessRequestOp { .Def2("For-record-append", "A", &WriteIdAllocOp::isForRecordAppend, false) .Def2("Client-cseq", "Cc", &WriteIdAllocOp::clientSeq) .Def2("Write-prepare-reply", "WR", &WriteIdAllocOp::writePrepareReplyFlag) + .Def2("No-forward", "NF", &WriteIdAllocOp::noForwardFlag, false) .Def2("Content-length", "l", &WriteIdAllocOp::contentLength, 0) .Def2("C-access-length", "AL", &WriteIdAllocOp::chunkAccessLength) ; @@ -1369,6 +1397,7 @@ struct WritePrepareOp : public ChunkAccessRequestOp { uint32_t checksum; /* input: as computed by the sender; 0 means sender didn't send */ StringBufT<256> servers; /* input: set of servers on which to write */ bool replyRequestedFlag; + bool noForwardFlag; int accessFwdLength; int chunkAccessLength; SyncReplicationAccess syncReplicationAccess; @@ -1378,6 +1407,8 @@ struct WritePrepareOp : public ChunkAccessRequestOp { uint32_t numDone; // sub/forwarding ops count BufferManager* 
devBufMgr; uint32_t receivedChecksum; + int checksumsCnt; + TokenValue checksumsVal; vector blocksChecksums; WritePrepareOp() @@ -1388,6 +1419,7 @@ struct WritePrepareOp : public ChunkAccessRequestOp { checksum(0), servers(), replyRequestedFlag(false), + noForwardFlag(false), accessFwdLength(0), chunkAccessLength(0), syncReplicationAccess(), @@ -1397,6 +1429,8 @@ struct WritePrepareOp : public ChunkAccessRequestOp { numDone(0), devBufMgr(0), receivedChecksum(0), + checksumsCnt(0), + checksumsVal(), blocksChecksums() { SET_HANDLER(this, &WritePrepareOp::Done); } ~WritePrepareOp(); @@ -1407,6 +1441,7 @@ struct WritePrepareOp : public ChunkAccessRequestOp { return syncReplicationAccess.Parse( is, chunkAccessLength, accessFwdLength); } + bool Validate(); void Response(ReqOstream& os); void Execute(); void ForwardToPeer( @@ -1439,7 +1474,10 @@ struct WritePrepareOp : public ChunkAccessRequestOp { .Def2("Num-servers", "R", &WritePrepareOp::numServers) .Def2("Servers", "S", &WritePrepareOp::servers) .Def2("Checksum", "K", &WritePrepareOp::checksum) + .Def2("Checksum-entries", "KC", &WritePrepareOp::checksumsCnt) + .Def2("Checksums", "Ks", &WritePrepareOp::checksumsVal) .Def2("Reply", "RR", &WritePrepareOp::replyRequestedFlag) + .Def2("No-forward", "NF", &WritePrepareOp::noForwardFlag, false) .Def2("Access-fwd-length", "AF", &WritePrepareOp::accessFwdLength, 0) .Def2("C-access-length", "AL", &WritePrepareOp::chunkAccessLength) ; @@ -1583,6 +1621,7 @@ struct WriteSyncOp : public ChunkAccessRequestOp { uint32_t numServers; StringBufT<256> servers; WriteSyncOp* fwdedOp; + bool noForwardFlag; WriteOp* writeOp; // the underlying write that needs to be pushed to disk uint32_t numDone; // if we did forwarding, we wait for // local/remote to be done; otherwise, we only @@ -1650,6 +1689,7 @@ struct WriteSyncOp : public ChunkAccessRequestOp { .Def2("Servers", "S", &WriteSyncOp::servers) .Def2("Checksum-entries", "KC", &WriteSyncOp::checksumsCnt) .Def2("Checksums", "K", 
&WriteSyncOp::checksumsVal) + .Def2("No-forward", "NF", &WriteSyncOp::noForwardFlag, false) .Def2("Content-length", "l", &WriteSyncOp::contentLength, 0) .Def2("C-access-length", "AL", &WriteSyncOp::chunkAccessLength) ; diff --git a/src/cc/chunk/LeaseClerk.cc b/src/cc/chunk/LeaseClerk.cc index ffee00897..925ab6091 100644 --- a/src/cc/chunk/LeaseClerk.cc +++ b/src/cc/chunk/LeaseClerk.cc @@ -367,7 +367,7 @@ LeaseClerk::RelinquishLease(kfsChunkId_t chunkId, int64_t chunkVersion, const LeaseInfo_t& lease = *it; LeaseRelinquishOp* const op = new LeaseRelinquishOp( chunkId, chunkVersion, lease.leaseId, kWriteLease); - KFS_LOG_STREAM_INFO << + KFS_LOG_STREAM_DEBUG << "sending lease relinquish for:" " chunk: " << chunkId << " version: " << chunkVersion << diff --git a/src/cc/common/BufferedLogWriter.cc b/src/cc/common/BufferedLogWriter.cc index 8846f5d14..528b43c45 100644 --- a/src/cc/common/BufferedLogWriter.cc +++ b/src/cc/common/BufferedLogWriter.cc @@ -316,7 +316,7 @@ class BufferedLogWriter::Impl : public QCRunnable return; } mRunFlag = true; - const int kStackSize = 64 << 10; + const int kStackSize = 256 << 10; mThread.Start(this, kStackSize, 0, QCThread::CpuAffinity(mCpuAffinityIndex)); } diff --git a/src/cc/kfsio/IOBuffer.cc b/src/cc/kfsio/IOBuffer.cc index d0f43cfec..0ee0e9d92 100644 --- a/src/cc/kfsio/IOBuffer.cc +++ b/src/cc/kfsio/IOBuffer.cc @@ -625,6 +625,29 @@ IOBuffer::Append(IOBuffer *ioBuf) return nBytes; } +IOBuffer::BufPos +IOBuffer::AppendShared(const IOBuffer& other) +{ + DebugChecksum(other, other.mByteCount); + BufPos nBytes = 0; + for (BList::const_iterator it = other.mBuf.begin(); + it != other.mBuf.end(); + ++it) { + const BufPos nb = it->BytesConsumable(); + if (nb > 0) { + mBuf.push_back(IOBufferData(*it, + const_cast(it->Consumer()), + const_cast(it->Producer()))); + nBytes += nb; + } + } + assert(mByteCount >= 0); + mByteCount += nBytes; + DebugVerify(); + other.DebugVerify(); + return nBytes; +} + inline IOBuffer::BList::iterator 
IOBuffer::BeginSpaceAvailable(IOBuffer::BufPos* nBytes /* = 0 */) { diff --git a/src/cc/kfsio/IOBuffer.h b/src/cc/kfsio/IOBuffer.h index d782ca3f9..6353487d9 100644 --- a/src/cc/kfsio/IOBuffer.h +++ b/src/cc/kfsio/IOBuffer.h @@ -283,6 +283,10 @@ class IOBuffer /// Append the contents of ioBuf to this buffer. BufPos Append(IOBuffer *ioBuf); + // Append shared references to the consumable blocks in other. + // This does not modify other and does not copy payload bytes. + BufPos AppendShared(const IOBuffer& other); + /// Move data buffers with space available at the end of ioBuf. /// @param[in] other Buffer from which the available space to move /// @param[in] numBytes # of bytes of available space to be used diff --git a/src/cc/libclient/KfsClient.cc b/src/cc/libclient/KfsClient.cc index fbd70de6e..97980cb77 100644 --- a/src/cc/libclient/KfsClient.cc +++ b/src/cc/libclient/KfsClient.cc @@ -1324,6 +1324,7 @@ class KfsClientImpl::ClientsList vector mGroups; int mDefaultFileAttributeRevalidateTime; unsigned int mDefaultFileAttributeRevalidateScan; + size_t mDefaultMaxFAttrCacheSize; static const Globals& Get() { @@ -1341,8 +1342,9 @@ class KfsClientImpl::ClientsList mEUser(geteuid()), mEGroup(getegid()), mGroups(), - mDefaultFileAttributeRevalidateTime(30), - mDefaultFileAttributeRevalidateScan(64) + mDefaultFileAttributeRevalidateTime(3600), + mDefaultFileAttributeRevalidateScan(64), + mDefaultMaxFAttrCacheSize(262144) { signal(SIGPIPE, SIG_IGN); libkfsio::InitGlobals(); @@ -1389,6 +1391,19 @@ class KfsClientImpl::ClientsList } } } + const char* cacheSzPtr = + getenv("QFS_CLIENT_MAX_FATTR_CACHE_SIZE"); + if (! 
cacheSzPtr) { + cacheSzPtr = getenv("KFS_CLIENT_MAX_FATTR_CACHE_SIZE"); + } + if (cacheSzPtr) { + char* e = 0; + const long v = strtol(cacheSzPtr, &e, 10); + if (cacheSzPtr < e && (*e & 0xFF) <= ' ') { + mDefaultMaxFAttrCacheSize = (size_t)max( + 16L << 10, v); + } + } } ~Globals() { Instance().Shutdown(); } @@ -1414,6 +1429,16 @@ class KfsClientImpl::ClientsList globals.mDefaultFileAttributeRevalidateTime; client.mFileAttributeRevalidateScan = globals.mDefaultFileAttributeRevalidateScan; + client.mMaxFAttrCacheSize = globals.mDefaultMaxFAttrCacheSize; + client.mLookupRpcCount = 0; + client.mLookupPathCacheQueryCount = 0; + client.mLookupPathCacheHitCount = 0; + client.mLookupPathCacheStaleCount = 0; + client.mLookupPathCacheMissCount = 0; + client.mLookupFidNameCacheQueryCount = 0; + client.mLookupFidNameCacheHitCount = 0; + client.mLookupFidNameCacheStaleCount = 0; + client.mLookupFidNameCacheMissCount = 0; client.mClientId = mNextClientId++; } void RemoveSelf(KfsClientImpl& client) @@ -1526,9 +1551,19 @@ KfsClientImpl::KfsClientImpl( mDeleteClearFattr(0), mFreeFileTableEntires(), mFattrCacheSkipValidateCnt(0), - mFileAttributeRevalidateTime(30), + mFileAttributeRevalidateTime(3600), mFileAttributeRevalidateScan(64), mFAttrCacheGeneration(1), + mMaxFAttrCacheSize(262144), + mLookupRpcCount(0), + mLookupPathCacheQueryCount(0), + mLookupPathCacheHitCount(0), + mLookupPathCacheStaleCount(0), + mLookupPathCacheMissCount(0), + mLookupFidNameCacheQueryCount(0), + mLookupFidNameCacheHitCount(0), + mLookupFidNameCacheStaleCount(0), + mLookupFidNameCacheMissCount(0), mTmpPath(), mTmpAbsPathStr(), mTmpAbsPath(), @@ -2106,10 +2141,11 @@ KfsClientImpl::Mkdir(const char *pathname, kfsMode_t mode) kfsFileId_t parentFid; string dirname; + string path; const bool kInvalidateSubCountsFlag = true; const bool kEnforceLastDirFlag = false; int res = GetPathComponents( - pathname, &parentFid, dirname, 0, + pathname, &parentFid, dirname, &path, kInvalidateSubCountsFlag, 
kEnforceLastDirFlag); if (res < 0) { return res; @@ -2126,6 +2162,10 @@ KfsClientImpl::Mkdir(const char *pathname, kfsMode_t mode) if (op.status < 0) { return GetOpStatus(op); } + if (0 <= op.fileId) { + CacheCreatedEntry(parentFid, dirname, path, op.fileId, + op.permissions, true); + } time_t now = 0; // assign to suppress compiler warning. if (! op.userName.empty()) { now = time(0); @@ -3528,13 +3568,26 @@ KfsClientImpl::CreateSelf(const char *pathname, int numReplicas, bool exclusive, const bool kInvalidateSubCountsFlag = true; res = GetPathComponents(pathname, &parentFid, filename, &path, kInvalidateSubCountsFlag); - Delete(LookupFAttr(parentFid, filename)); if (res < 0) { KFS_LOG_STREAM_DEBUG << pathname << ": GetPathComponents: " << res << KFS_LOG_EOM; return res; } + return CreateSelfResolved(pathname, parentFid, filename, path, + numReplicas, exclusive, numStripes, numRecoveryStripes, stripeSize, + stripedType, forceTypeFlag, mode, minSTier, maxSTier); +} + +int +KfsClientImpl::CreateSelfResolved(const char *pathname, kfsFileId_t parentFid, + const string& filename, const string& path, int numReplicas, + bool exclusive, int numStripes, int numRecoveryStripes, int stripeSize, + int stripedType, bool forceTypeFlag, kfsMode_t mode, + kfsSTier_t minSTier, kfsSTier_t maxSTier) +{ + assert(mMutex.IsOwned()); + Delete(LookupFAttr(parentFid, filename)); CreateOp op(0, parentFid, filename.c_str(), numReplicas, exclusive, Permissions( mUseOsUserAndGroupFlag ? mEUser : kKfsUserNone, @@ -3572,33 +3625,30 @@ KfsClientImpl::CreateSelf(const char *pathname, int numReplicas, bool exclusive, " striped file type " << op.striperType << " is not supported " << " got: " << op.metaStriperType << KFS_LOG_EOM; - // Cleanup the file. 
RemoveOp rm(0, parentFid, filename.c_str(), pathname); DoMetaOpWithRetry(&rm); return -ENXIO; } + if (0 <= op.fileId) { + CacheCreatedEntry(parentFid, filename, path, op.fileId, + op.permissions, false); + } - // Do not attempt to re-use possibly existing file table entry. - // If file existed and being written into it is moved into the dumpster by - // the meta server. - // An attempt to re-use the same file table entry would route the ios to the - // previously existed file into newly created one. const int fte = AllocFileTableEntry(parentFid, filename, path); - if (fte < 0) { // XXX Too many open files + if (fte < 0) { KFS_LOG_STREAM_DEBUG << pathname << ": AllocFileTableEntry: " << fte << KFS_LOG_EOM; return fte; } - // make it the same as creat(): equivalent to open(O_CREAT|O_WRONLY|O_TRUNC). FileTableEntry& entry = *mFileTable[fte]; entry.openMode = O_WRONLY; FileAttr& fa = entry.fattr; - fa.Init(false); // is an ordinary file + fa.Init(false); fa.fileId = op.fileId; fa.numReplicas = op.metaNumReplicas; - fa.fileSize = 0; // presently CreateOp always deletes file if exists. + fa.fileSize = 0; fa.minSTier = op.minSTier; fa.maxSTier = op.maxSTier; if (op.metaStriperType != KFS_STRIPED_FILE_TYPE_NONE) { @@ -3619,7 +3669,6 @@ KfsClientImpl::CreateSelf(const char *pathname, int numReplicas, bool exclusive, } UpdateGroupId(op.groupName, fa.group, now); } - // Set optimal io size, like open does. SetOptimalReadAheadSize(entry, mDefaultReadAheadSize); SetOptimalIoBufferSize(entry, mDefaultIoBufferSize); KFS_LOG_STREAM_DEBUG << @@ -4149,7 +4198,15 @@ KfsClientImpl::OpenSelf(const char *pathname, int openMode, int numReplicas, kfsFileId_t parentFid = -1; string filename; string fpath; - const int res = GetPathComponents(pathname, &parentFid, filename, &fpath); + const bool createFastPathFlag = + ! 
cacheAttributesFlag && (openMode & O_CREAT) != 0 && + (openMode & (O_EXCL | O_TRUNC | O_APPEND)) == 0 && + (openMode & (O_RDWR | O_WRONLY)) != 0; + const bool kInvalidateSubCountsFlag = false; + const bool kEnforceLastDirFlag = true; + const bool kFollowSymLinkFlag = ! createFastPathFlag; + int res = GetPathComponents(pathname, &parentFid, filename, &fpath, + kInvalidateSubCountsFlag, kEnforceLastDirFlag, kFollowSymLinkFlag); if (res < 0) { return res; } @@ -4158,6 +4215,27 @@ KfsClientImpl::OpenSelf(const char *pathname, int openMode, int numReplicas, if (path) { *path = fpath; } + if (createFastPathFlag) { + int cres = KfsClient::ValidateCreateParams( + numReplicas, numStripes, numRecoveryStripes, + stripeSize, stripedType, minSTier, maxSTier); + if (cres < 0) { + return cres; + } + const int fte = CreateSelfResolved(pathname, parentFid, filename, fpath, + numReplicas, true /* exclusive */, numStripes, numRecoveryStripes, + stripeSize, stripedType, false, mode, minSTier, maxSTier); + if (fte >= 0 || fte != -EEXIST) { + return fte; + } + res = GetPathComponents(pathname, &parentFid, filename, &fpath); + if (res < 0) { + return res; + } + if (path) { + *path = fpath; + } + } bool objectStoreTruncateFlag = false; LookupOp op(0, parentFid, filename.c_str()); FAttr* fa = LookupFAttr(parentFid, filename); @@ -4848,6 +4926,7 @@ KfsClientImpl::StartProtocolWorker() return; } KfsProtocolWorker::Parameters params; + params.mUseClientPoolFlag = true; if (mProtocolWorkerAuthCtx.IsEnabled()) { params.mAuthContextPtr = &mProtocolWorkerAuthCtx; } @@ -4881,6 +4960,9 @@ KfsClientImpl::StartProtocolWorker() } params.mUseClientPoolFlag = mConfig.getValue( "client.connectionPool", params.mUseClientPoolFlag ? 1 : 0) != 0; + params.mParallelReplicaWriteFlag = mConfig.getValue( + "client.parallelReplicaWrite", + params.mParallelReplicaWriteFlag ? 
1 : 0) != 0; params.mMetaServerNodes = mConfig.getValue( KfsClient::GetMetaServerNodesParamName(), params.mMetaServerNodes); params.mClientRackId = mConfig.getValue( @@ -5648,16 +5730,20 @@ KfsClientImpl::FindFreeFileTableEntry() void KfsClientImpl::ValidateFAttrCache(time_t now, int maxScan) { - FAttr* p; - const time_t expire = now - mFileAttributeRevalidateTime; - int rem = maxScan; - while ((p = FAttrLru::Front(mFAttrLru)) && - (p->validatedTime < expire || - p->generation != mFAttrCacheGeneration)) { - Delete(p); - if (--rem < 0) { - break; + FAttr* p; + int rem = maxScan; + while ((p = FAttrLru::Front(mFAttrLru))) { + const bool expiredFlag = + (0 <= mFileAttributeRevalidateTime && + p->validatedTime < now - mFileAttributeRevalidateTime); + if (p->generation != mFAttrCacheGeneration || expiredFlag) { + Delete(p); + if (--rem < 0) { + break; + } + continue; } + break; } } @@ -5694,9 +5780,8 @@ KfsClientImpl::NewFAttr(kfsFileId_t parentFid, const string& name, mFattrCacheSkipValidateCnt = 0; ValidateFAttrCache(time(0), mFileAttributeRevalidateScan); } - const size_t kMaxInodeCacheSize = 16 << 10; for (size_t sz = mFidNameToFAttrMap.size(); - kMaxInodeCacheSize <= sz; + mMaxFAttrCacheSize <= sz; sz--) { Delete(FAttrLru::Front(mFAttrLru)); } @@ -5861,15 +5946,47 @@ KfsClientImpl::Lookup(kfsFileId_t parentFid, const string& name, assert(! path.empty() && *path.begin() == '/' && name != "." 
&& name != ".."); + mLookupFidNameCacheQueryCount++; fa = LookupFAttr(parentFid, name); if (fa && IsValid(*fa, now)) { + mLookupFidNameCacheHitCount++; UpdatePath(fa, path); return 0; } + if (fa) { + mLookupFidNameCacheStaleCount++; + } else { + mLookupFidNameCacheMissCount++; + } + mLookupRpcCount++; LookupOp op(0, parentFid, name.c_str()); return LookupSelf(op, parentFid, name, fa, now, path); } +void +KfsClientImpl::CacheCreatedEntry( + kfsFileId_t parentFid, + const string& name, + const string& fullPath, + kfsFileId_t fileId, + const Permissions& perms, + bool isDirectory) +{ + if (fileId < 0 || fullPath.empty() || fullPath[0] != '/') { + return; + } + FileAttr attr; + attr.fileId = fileId; + attr.isDirectory = isDirectory; + attr.user = perms.user; + attr.group = perms.group; + attr.mode = perms.mode; + attr.Init(isDirectory); + FAttr* fa = 0; + const time_t now = time(0); + (void)UpdateFattr(parentFid, name, fa, fullPath, attr, now); +} + int KfsClientImpl::LookupSelf(LookupOp& op, kfsFileId_t parentFid, const string& name, @@ -5991,6 +6108,32 @@ KfsClientImpl::GetPathComponents(const char* pathname, kfsFileId_t* parentFid, if (! followSymLinkFlag && lastFlag && noCheckLastDirFlag) { break; } + mLookupPathCacheQueryCount++; + fa = LookupFAttr(npath, static_cast(0)); + if (fa && IsValid(*fa, now)) { + mLookupPathCacheHitCount++; + if (! 
fa->isDirectory) { + if (lastFlag && noCheckLastDirFlag) { + break; + } + res = -ENOTDIR; + break; + } + if (invalidateSubCountsFlag) { + fa->staleSubCountsFlag = true; + } + *parentFid = fa->fileId; + if (lastFlag) { + break; + } + mTmpPath.push_back(make_pair(*parentFid, i)); + continue; + } + if (fa) { + mLookupPathCacheStaleCount++; + } else { + mLookupPathCacheMissCount++; + } fa = 0; if ((res = Lookup(*parentFid, name, fa, now, npath)) != 0) { if (lastFlag && -ENOENT == res && noCheckLastDirFlag) { @@ -7809,12 +7952,65 @@ KfsClientImpl::GetStats() { QCStMutexLocker l(mMutex); StartProtocolWorker(); - Properties stats = mProtocolWorker->GetStats(); - if (stats.empty()) { - return 0; + Properties workerStats = mProtocolWorker->GetStats(); + Properties* const ret = new Properties(); + if (! workerStats.empty()) { + ret->swap(workerStats); + } + const int64_t lookupTotal = mLookupRpcCount + + mLookupPathCacheHitCount + mLookupFidNameCacheHitCount; + string val; + AppendDecIntToString(val, mLookupRpcCount); + ret->setValue("PathCache.LookupRpc", val); + val.clear(); + AppendDecIntToString(val, mLookupPathCacheQueryCount); + ret->setValue("PathCache.PathQuery", val); + val.clear(); + AppendDecIntToString(val, mLookupPathCacheHitCount); + ret->setValue("PathCache.PathHit", val); + val.clear(); + AppendDecIntToString(val, mLookupPathCacheStaleCount); + ret->setValue("PathCache.PathStale", val); + val.clear(); + AppendDecIntToString(val, mLookupPathCacheMissCount); + ret->setValue("PathCache.PathMiss", val); + val.clear(); + AppendDecIntToString(val, mLookupFidNameCacheQueryCount); + ret->setValue("PathCache.FidNameQuery", val); + val.clear(); + AppendDecIntToString(val, mLookupFidNameCacheHitCount); + ret->setValue("PathCache.FidNameHit", val); + val.clear(); + AppendDecIntToString(val, mLookupFidNameCacheStaleCount); + ret->setValue("PathCache.FidNameStale", val); + val.clear(); + AppendDecIntToString(val, mLookupFidNameCacheMissCount); + 
ret->setValue("PathCache.FidNameMiss", val); + val.clear(); + AppendDecIntToString(val, lookupTotal); + ret->setValue("PathCache.LookupTotal", val); + val.clear(); + AppendDecIntToString(val, (int64_t)mFidNameToFAttrMap.size()); + ret->setValue("PathCache.FattrEntries", val); + val.clear(); + AppendDecIntToString(val, (int64_t)mPathCache.size()); + ret->setValue("PathCache.PathEntries", val); + val.clear(); + AppendDecIntToString(val, (int64_t)mMaxFAttrCacheSize); + ret->setValue("PathCache.MaxEntries", val); + val.clear(); + AppendDecIntToString(val, (int64_t)mFAttrCacheGeneration); + ret->setValue("PathCache.Generation", val); + val.clear(); + AppendDecIntToString(val, (int64_t)mFileAttributeRevalidateTime); + ret->setValue("PathCache.RevalidateSec", val); + if (0 < lookupTotal) { + char buf[64]; + const double ratio = (double)(mLookupPathCacheHitCount + + mLookupFidNameCacheHitCount) / (double)lookupTotal; + snprintf(buf, sizeof(buf), "%.6f", ratio); + ret->setValue("PathCache.HitRatio", buf); } - Properties* const ret = new Properties(); - ret->swap(stats); return ret; } diff --git a/src/cc/libclient/KfsClientInt.h b/src/cc/libclient/KfsClientInt.h index e50acfc85..17aec32db 100644 --- a/src/cc/libclient/KfsClientInt.h +++ b/src/cc/libclient/KfsClientInt.h @@ -785,6 +785,16 @@ class KfsClientImpl : private KfsNetClient::OpOwner int mFileAttributeRevalidateTime; unsigned int mFileAttributeRevalidateScan; unsigned int mFAttrCacheGeneration; + size_t mMaxFAttrCacheSize; + int64_t mLookupRpcCount; + int64_t mLookupPathCacheQueryCount; + int64_t mLookupPathCacheHitCount; + int64_t mLookupPathCacheStaleCount; + int64_t mLookupPathCacheMissCount; + int64_t mLookupFidNameCacheQueryCount; + int64_t mLookupFidNameCacheHitCount; + int64_t mLookupFidNameCacheStaleCount; + int64_t mLookupFidNameCacheMissCount; TmpPath mTmpPath; string mTmpAbsPathStr; Path mTmpAbsPath; @@ -855,7 +865,8 @@ class KfsClientImpl : private KfsNetClient::OpOwner bool IsValid(const FAttr& fa, 
time_t now) const { return (fa.generation == mFAttrCacheGeneration && - now <= fa.validatedTime + mFileAttributeRevalidateTime); + (mFileAttributeRevalidateTime < 0 || + now <= fa.validatedTime + mFileAttributeRevalidateTime)); } void Shutdown(); @@ -887,6 +898,11 @@ class KfsClientImpl : private KfsNetClient::OpOwner int CreateSelf(const char *pathname, int numReplicas, bool exclusive, int numStripes, int numRecoveryStripes, int stripeSize, int stripedType, bool forceTypeFlag, kfsMode_t mode, kfsSTier_t minSTier, kfsSTier_t maxSTier); + int CreateSelfResolved(const char *pathname, kfsFileId_t parentFid, + const string& filename, const string& path, int numReplicas, + bool exclusive, int numStripes, int numRecoveryStripes, + int stripeSize, int stripedType, bool forceTypeFlag, kfsMode_t mode, + kfsSTier_t minSTier, kfsSTier_t maxSTier); ssize_t SetReadAheadSize(FileTableEntry& inEntry, size_t inSize, bool optimalFlag = false); ssize_t SetIoBufferSize(FileTableEntry& entry, size_t size, bool optimalFlag = false); ssize_t SetOptimalIoBufferSize(FileTableEntry& entry, size_t size) { @@ -978,6 +994,13 @@ class KfsClientImpl : private KfsNetClient::OpOwner kfsFileId_t parentFid, const string& name, FAttr*& fa, time_t now, const string& path); FAttr* LookupFattr(kfsFileId_t parentFid, const string& name); + void CacheCreatedEntry( + kfsFileId_t parentFid, + const string& name, + const string& fullPath, + kfsFileId_t fileId, + const Permissions& perms, + bool isDirectory); // name -- is the last component of the pathname int AllocFileTableEntry( diff --git a/src/cc/libclient/KfsOps.cc b/src/cc/libclient/KfsOps.cc index 66a7a0b3b..dc63c0dd8 100644 --- a/src/cc/libclient/KfsOps.cc +++ b/src/cc/libclient/KfsOps.cc @@ -614,6 +614,9 @@ CloseOp::Request(ReqOstream& os) chunkVersion << "\r\n" << Access() ; + if (noForwardFlag) { + os << (shortRpcFormatFlag ? "NF:1\r\n" : "No-forward: 1\r\n"); + } if (! writeInfo.empty()) { os << (shortRpcFormatFlag ? 
"W:1\r\n" : "Has-write-id: 1\r\n") << @@ -663,12 +666,22 @@ WriteIdAllocOp::Request(ReqOstream& os) { os << "WRITE_ID_ALLOC\r\n" << ReqHeaders(*this) << - (shortRpcFormatFlag ? "H:" : "Chunk-handle: ") << chunkId << "\r\n" << + (shortRpcFormatFlag ? "H:" : "Chunk-handle: ") << chunkId << "\r\n"; + if (fileId >= 0) { + os << (shortRpcFormatFlag ? "P:" : "File-handle: ") << + fileId << "\r\n"; + } + if (leaseId >= 0) { + os << (shortRpcFormatFlag ? "L:" : "Lease-id: ") << + leaseId << "\r\n"; + } + os << (shortRpcFormatFlag ? "V:" : "Chunk-version: ") << chunkVersion << "\r\n" << (shortRpcFormatFlag ? "O:" : "Offset: ") << offset << "\r\n" << (shortRpcFormatFlag ? "B:" : "Num-bytes: ") << numBytes << "\r\n" << (shortRpcFormatFlag ? "A:" : "For-record-append: ") << (isForRecordAppend ? 1 : 0) << "\r\n" << + (noForwardFlag ? (shortRpcFormatFlag ? "NF:1\r\n" : "No-forward: 1\r\n") : "") << (shortRpcFormatFlag ? "R:" : "Num-servers: ") << chunkServerLoc.size() << "\r\n" << Access() << @@ -737,10 +750,16 @@ WritePrepareOp::Request(ReqOstream& os) (shortRpcFormatFlag ? "K:" : "Checksum: ") << checksum << "\r\n" << Access() ; + if (noForwardFlag) { + os << (shortRpcFormatFlag ? "NF:1\r\n" : "No-forward: 1\r\n"); + } if (! checksums.empty()) { os << (shortRpcFormatFlag ? "KC:" : "Checksum-entries: ") << checksums.size() << "\r\n" << (shortRpcFormatFlag ? "Ks:" : "Checksums: "); + if (shortRpcFormatFlag) { + os << std::hex; + } for (size_t i = 0; i < checksums.size(); i++) { os << checksums[i] << ' '; } @@ -775,8 +794,14 @@ WriteSyncOp::Request(ReqOstream& os) checksums.size() << "\r\n" << Access() ; + if (noForwardFlag) { + os << (shortRpcFormatFlag ? "NF:1\r\n" : "No-forward: 1\r\n"); + } if (! checksums.empty()) { os << (shortRpcFormatFlag ? 
"K:" : "Checksums: "); + if (shortRpcFormatFlag) { + os << std::hex; + } for (size_t i = 0; i < checksums.size(); i++) { os << checksums[i] << ' '; } @@ -1413,6 +1438,8 @@ AllocateOp::ParseResponseHeaderSelf(const Properties& prop) if (status < 0) { return; } + leaseId = prop.getValue( + shortRpcFormatFlag ? "L" : "Lease-id", int64_t(-1)); chunkLeaseDuration = prop.getValue( shortRpcFormatFlag ? "LD" : "Lease-duration", int64_t(-1)); if (ParseChunkServerAccess(*this, prop.getValue( diff --git a/src/cc/libclient/KfsOps.h b/src/cc/libclient/KfsOps.h index a89a73c9e..fcd5270a0 100644 --- a/src/cc/libclient/KfsOps.h +++ b/src/cc/libclient/KfsOps.h @@ -1043,6 +1043,7 @@ struct AllocateOp : public KfsOp { bool invalidateAllFlag; bool allowCSClearTextFlag; bool allCSShortRpcFlag; + int64_t leaseId; int64_t chunkLeaseDuration; int64_t chunkServerAccessValidForTime; int64_t chunkServerAccessIssuedTime; @@ -1063,6 +1064,7 @@ struct AllocateOp : public KfsOp { invalidateAllFlag(false), allowCSClearTextFlag(false), allCSShortRpcFlag(false), + leaseId(-1), chunkLeaseDuration(-1), chunkServerAccessValidForTime(0), chunkServerAccessIssuedTime(0), @@ -1084,6 +1086,7 @@ struct AllocateOp : public KfsOp { invalidateAllFlag = false; allowCSClearTextFlag = false; allCSShortRpcFlag = false; + leaseId = -1; chunkLeaseDuration = -1; chunkServerAccessValidForTime = 0; chunkServerAccessIssuedTime = 0; @@ -1188,14 +1191,17 @@ class ShowWriteInfo { struct CloseOp : public ChunkAccessOp { vector chunkServerLoc; vector writeInfo; + bool noForwardFlag; CloseOp(kfsSeq_t s, kfsChunkId_t c) : ChunkAccessOp(CMD_CLOSE, s, c), - writeInfo() + writeInfo(), + noForwardFlag(false) {} CloseOp(kfsSeq_t s, kfsChunkId_t c, const vector& wi) : ChunkAccessOp(CMD_CLOSE, s, c), - writeInfo(wi) + writeInfo(wi), + noForwardFlag(false) {} void Request(ReqOstream& os); virtual ostream& ShowSelf(ostream& os) const { @@ -1262,19 +1268,25 @@ struct ReadOp : public ChunkAccessOp { // op that defines the write that is 
going to happen struct WriteIdAllocOp : public ChunkAccessOp { + kfsFileId_t fileId; /* input, optional for lazy chunk create */ + int64_t leaseId; /* input, optional for lazy chunk create */ chunkOff_t offset; /* input */ size_t numBytes; /* input */ bool isForRecordAppend; /* set if this is for a record append that is coming */ bool writePrepReplySupportedFlag; + bool noForwardFlag; string writeIdStr; /* output */ vector chunkServerLoc; WriteIdAllocOp(kfsSeq_t s, kfsChunkId_t c, int64_t v, chunkOff_t o, size_t n) : ChunkAccessOp(CMD_WRITE_ID_ALLOC, s, c), + fileId(-1), + leaseId(-1), offset(o), numBytes(n), isForRecordAppend(false), - writePrepReplySupportedFlag(false) + writePrepReplySupportedFlag(false), + noForwardFlag(false) { chunkVersion = v; } void Request(ReqOstream& os); virtual void ParseResponseHeaderSelf(const Properties& prop); @@ -1290,6 +1302,7 @@ struct WritePrepareOp : public ChunkAccessOp { chunkOff_t offset; /* input */ size_t numBytes; /* input */ bool replyRequestedFlag; + bool noForwardFlag; vector checksums; /* checksum for each 64KB block */ vector writeInfo; /* input */ @@ -1298,6 +1311,7 @@ struct WritePrepareOp : public ChunkAccessOp { offset(0), numBytes(0), replyRequestedFlag(false), + noForwardFlag(false), checksums(), writeInfo() { chunkVersion = v; } @@ -1319,6 +1333,7 @@ struct WriteSyncOp : public ChunkAccessOp { chunkOff_t offset; /* input */ size_t numBytes; /* input */ vector writeInfo; + bool noForwardFlag; // The checksums that cover the region. 
vector checksums; @@ -1326,7 +1341,8 @@ struct WriteSyncOp : public ChunkAccessOp { : ChunkAccessOp(CMD_WRITE_SYNC, 0, 0), offset(0), numBytes(0), - writeInfo() + writeInfo(), + noForwardFlag(false) {} void Request(ReqOstream& os); virtual ostream& ShowSelf(ostream& os) const { diff --git a/src/cc/libclient/KfsProtocolWorker.cc b/src/cc/libclient/KfsProtocolWorker.cc index 4752048f1..07eb3da62 100644 --- a/src/cc/libclient/KfsProtocolWorker.cc +++ b/src/cc/libclient/KfsProtocolWorker.cc @@ -138,6 +138,7 @@ class KfsProtocolWorker::Impl : mStopRequest(), mWorker(this, "KfsProtocolWorker"), mMutex(), + mParallelReplicaWriteFlag(inParameters.mParallelReplicaWriteFlag), mClientPoolPtr(inParameters.mUseClientPoolFlag ? new ClientPool( mNetManager, @@ -154,7 +155,7 @@ class KfsProtocolWorker::Impl : int64_t(std::numeric_limits::max()) ), // inMaxContentLength false, // inFailAllOpsOnOpTimeoutFlag - false, // inMaxOneOutstandingOpFlag + true, // inMaxOneOutstandingOpFlag 0 // inAuthContextPtr ) : 0 ), @@ -1149,7 +1150,9 @@ class KfsProtocolWorker::Impl : min(max(4 << 20, inOwner.mMaxWriteSize), max(inOwner.mMaxWriteSize, inMaxWriteSize)), inLogPrefixPtr, - inOwner.mChunkServerInitialSeqNum + inOwner.mChunkServerInitialSeqNum, + inOwner.mClientPoolPtr, + inOwner.mParallelReplicaWriteFlag ), mCurRequestPtr(0), mAsyncStatus(0) @@ -1716,6 +1719,7 @@ class KfsProtocolWorker::Impl : StopRequest mStopRequest; QCThread mWorker; QCMutex mMutex; + const bool mParallelReplicaWriteFlag; ClientPool* const mClientPoolPtr; FileReader::Stats mReadStats; FileWriter::Stats mWriteStats; diff --git a/src/cc/libclient/KfsProtocolWorker.h b/src/cc/libclient/KfsProtocolWorker.h index 6b8165eb3..9a9784e37 100644 --- a/src/cc/libclient/KfsProtocolWorker.h +++ b/src/cc/libclient/KfsProtocolWorker.h @@ -214,7 +214,8 @@ class KfsProtocolWorker bool inResolverUseOsResolverFlag = false, int inResolverCacheSize = 8 << 10, int inResolverCacheExpiration = -1, - const string& inNodeId = string()) + const 
string& inNodeId = string(), + bool inParallelReplicaWriteFlag = true) : mMetaMaxRetryCount(inMetaMaxRetryCount), mMetaTimeSecBetweenRetries(inMetaTimeSecBetweenRetries), mMetaOpTimeoutSec(inMetaOpTimeoutSec), @@ -244,7 +245,8 @@ class KfsProtocolWorker mResolverUseOsResolverFlag(inResolverUseOsResolverFlag), mResolverCacheSize(inResolverCacheSize), mResolverCacheExpiration(inResolverCacheExpiration), - mNodeId(inNodeId) + mNodeId(inNodeId), + mParallelReplicaWriteFlag(inParallelReplicaWriteFlag) {} int mMetaMaxRetryCount; int mMetaTimeSecBetweenRetries; @@ -276,6 +278,7 @@ class KfsProtocolWorker int mResolverCacheSize; int mResolverCacheExpiration; string mNodeId; + bool mParallelReplicaWriteFlag; }; KfsProtocolWorker( std::string inMetaHost, diff --git a/src/cc/libclient/WriteAppender.cc b/src/cc/libclient/WriteAppender.cc index 564d72150..4dd35a064 100644 --- a/src/cc/libclient/WriteAppender.cc +++ b/src/cc/libclient/WriteAppender.cc @@ -1041,6 +1041,8 @@ class WriteAppender::Impl : private ITimeout, private KfsNetClient::OpOwner QCASSERT(mAllocOp.chunkId > 0 && ! 
mAllocOp.chunkServers.empty()); Reset(mWriteIdAllocOp); mWriteIdAllocOp.chunkId = mAllocOp.chunkId; + mWriteIdAllocOp.fileId = mAllocOp.fid; + mWriteIdAllocOp.leaseId = mAllocOp.leaseId; mWriteIdAllocOp.chunkVersion = mAllocOp.chunkVersion; mWriteIdAllocOp.isForRecordAppend = true; mWriteIdAllocOp.chunkServerLoc = mAllocOp.chunkServers; diff --git a/src/cc/libclient/Writer.cc b/src/cc/libclient/Writer.cc index e4a8d6d75..17cadd48f 100644 --- a/src/cc/libclient/Writer.cc +++ b/src/cc/libclient/Writer.cc @@ -31,6 +31,8 @@ #include #include #include +#include +#include #include "kfsio/IOBuffer.h" #include "kfsio/NetManager.h" @@ -47,6 +49,7 @@ #include "KfsOps.h" #include "KfsClient.h" #include "Monitor.h" +#include "ClientPool.h" namespace KFS { @@ -59,6 +62,13 @@ using std::string; using std::ostream; using std::ostringstream; +static int64_t WriterNowUsec() +{ + struct timeval tv; + gettimeofday(&tv, 0); + return int64_t(tv.tv_sec) * 1000000 + int64_t(tv.tv_usec); +} + // Kfs client write state machine implementation. 
class Writer::Impl : public QCRefCountedObj, @@ -92,7 +102,9 @@ class Writer::Impl : int inIdleTimeoutSec, int inMaxWriteSize, const string& inLogPrefix, - int64_t inChunkServerInitialSeqNum) + int64_t inChunkServerInitialSeqNum, + ClientPool* inClientPoolPtr, + bool inParallelReplicaWriteFlag) : QCRefCountedObj(), ITimeout(), KfsNetClient::OpOwner(), @@ -121,6 +133,8 @@ class Writer::Impl : mOffset(0), mOpenChunkBlockSize(CHUNKSIZE), mChunkServerInitialSeqNum(inChunkServerInitialSeqNum), + mClientPoolPtr(inClientPoolPtr), + mParallelReplicaWriteFlag(inParallelReplicaWriteFlag), mCompletionPtr(inCompletionPtr), mBuffer(), mLogPrefix(inLogPrefix), @@ -130,7 +144,9 @@ class Writer::Impl : mOpStartTime(0), mCompletionDepthCount(0), mStriperProcessCount(0), - mStriperPtr(0) + mStriperPtr(0), + mCloseStartUsec(0), + mSetSizeStartUsec(0) { Writers::Init(mWriters); } int Open( kfsFileId_t inFileId, @@ -214,6 +230,7 @@ class Writer::Impl : return kErrorTryAgain; } mClosingFlag = true; + mCloseStartUsec = WriterNowUsec(); return StartWrite(); } Offset Write( @@ -381,7 +398,29 @@ class Writer::Impl : size_t mBeginBlock; size_t mEndBlock; time_t mOpStartTime; + int64_t mEnqueueUsec; bool mChecksumValidFlag; + struct ParallelReplica + { + WritePrepareOp mPrepareOp; + IOBuffer mBuffer; + KfsNetClient* mClientPtr; + bool mDoneFlag; + ParallelReplica() + : mPrepareOp(0, 0, 0), + mBuffer(), + mClientPtr(0), + mDoneFlag(false) + {} + private: + ParallelReplica(const ParallelReplica&); + ParallelReplica& operator=(const ParallelReplica&); + }; + typedef std::vector ParallelReplicas; + ParallelReplicas mParallelReplicas; + int mParallelDoneCount; + int mParallelStatus; + string mParallelStatusMsg; WriteOp* mPrevPtr[1]; WriteOp* mNextPtr[1]; @@ -393,8 +432,25 @@ class Writer::Impl : mBeginBlock(0), mEndBlock(0), mOpStartTime(0), - mChecksumValidFlag(false) + mEnqueueUsec(0), + mChecksumValidFlag(false), + mParallelReplicas(), + mParallelDoneCount(0), + mParallelStatus(0), + 
mParallelStatusMsg() { Queue::Init(*this); } + void ClearParallelReplicas() + { + for (ParallelReplicas::iterator it = mParallelReplicas.begin(); + it != mParallelReplicas.end(); + ++it) { + delete *it; + } + mParallelReplicas.clear(); + mParallelDoneCount = 0; + mParallelStatus = 0; + mParallelStatusMsg.clear(); + } void Delete( WriteOp** inListPtr) { @@ -471,7 +527,7 @@ class Writer::Impl : } private: virtual ~WriteOp() - {} + { ClearParallelReplicas(); } WriteOp( const WriteOp& inWriteOp); WriteOp& operator=( @@ -499,6 +555,7 @@ class Writer::Impl : // cancel all pending ops by calling Stop() false // inResetConnectionOnOpTimeoutFlag ), + mChunkServerPtr(0), mErrorCode(0), mRetryCount(0), mPendingCount(0), @@ -509,6 +566,16 @@ class Writer::Impl : mAllocOp(0, 0, ""), mWriteIdAllocOp(0, 0, 0, 0, 0), mCloseOp(0, 0), + mParallelWriteIdReplicas(), + mParallelWriteIdDoneCount(0), + mParallelWriteIdStatus(0), + mParallelWriteIdStatusMsg(), + mParallelWriteIdStr(), + mParallelWritePrepReplySupportedFlag(true), + mParallelCloseReplicas(), + mParallelCloseDoneCount(0), + mParallelCloseStatus(0), + mParallelCloseStatusMsg(), mLastOpPtr(0), mSleepingFlag(false), mClosingFlag(false), @@ -524,7 +591,10 @@ class Writer::Impl : mChunkAccessExpireTime(0), mCSAccessExpireTime(0), mUpdateLeaseOp(0, -1, 0), - mSleepTimer(inOuter.mNetManager, *this) + mSleepTimer(inOuter.mNetManager, *this), + mChunkCloseStartUsec(0), + mWriteIdAllocStartUsec(0), + mAllocateStartUsec(0) { SET_HANDLER(this, &ChunkWriter::EventHandler); Queue::Init(mPendingQueue); @@ -693,7 +763,7 @@ class Writer::Impl : } return; } - mChunkServer.Stop(); + StopChunkServer(); if (mLastOpPtr == &mAllocOp) { mOuter.mMetaServer.Cancel(mLastOpPtr, this); } @@ -713,7 +783,7 @@ class Writer::Impl : // Start from the beginning -- chunk allocation. 
KFS_LOG_STREAM_DEBUG << mLogPrefix << "write lease expired: " << - mChunkServer.GetServerLocation() << + GetChunkServer().GetServerLocation() << " starting from chunk allocation, pending:" << " queue: " << (Queue::IsEmpty(mPendingQueue) ? "" : "not") << " empty" << @@ -798,9 +868,40 @@ class Writer::Impl : typedef std::bitset ChecksumBlocks; typedef NetManager::Timer Timer; enum { kLeaseRenewTime = LEASE_INTERVAL_SECS / 3 }; + struct ParallelWriteIdReplica + { + WriteIdAllocOp mOp; + KfsNetClient* mClientPtr; + bool mDoneFlag; + ParallelWriteIdReplica() + : mOp(0, 0, 0, 0, 0), + mClientPtr(0), + mDoneFlag(false) + {} + private: + ParallelWriteIdReplica(const ParallelWriteIdReplica&); + ParallelWriteIdReplica& operator=(const ParallelWriteIdReplica&); + }; + typedef std::vector ParallelWriteIdReplicas; + struct ParallelCloseReplica + { + CloseOp mOp; + KfsNetClient* mClientPtr; + bool mDoneFlag; + ParallelCloseReplica() + : mOp(0, 0), + mClientPtr(0), + mDoneFlag(false) + {} + private: + ParallelCloseReplica(const ParallelCloseReplica&); + ParallelCloseReplica& operator=(const ParallelCloseReplica&); + }; + typedef std::vector ParallelCloseReplicas; Impl& mOuter; ChunkServer mChunkServer; + ChunkServer* mChunkServerPtr; int mErrorCode; int mRetryCount; Offset mPendingCount; @@ -811,6 +912,16 @@ class Writer::Impl : AllocateOp mAllocOp; WriteIdAllocOp mWriteIdAllocOp; CloseOp mCloseOp; + ParallelWriteIdReplicas mParallelWriteIdReplicas; + int mParallelWriteIdDoneCount; + int mParallelWriteIdStatus; + string mParallelWriteIdStatusMsg; + string mParallelWriteIdStr; + bool mParallelWritePrepReplySupportedFlag; + ParallelCloseReplicas mParallelCloseReplicas; + int mParallelCloseDoneCount; + int mParallelCloseStatus; + string mParallelCloseStatusMsg; KfsOp* mLastOpPtr; bool mSleepingFlag; bool mClosingFlag; @@ -829,12 +940,43 @@ class Writer::Impl : Timer mSleepTimer; WriteOp* mPendingQueue[1]; WriteOp* mInFlightQueue[1]; + int64_t mChunkCloseStartUsec; + int64_t 
mWriteIdAllocStartUsec; + int64_t mAllocateStartUsec; ChunkWriter* mPrevPtr[1]; ChunkWriter* mNextPtr[1]; friend class QCDLListOp; typedef QCDLListOp ChunkWritersListOp; + void ClearParallelWriteIdReplicas() + { + for (ParallelWriteIdReplicas::iterator it = + mParallelWriteIdReplicas.begin(); + it != mParallelWriteIdReplicas.end(); + ++it) { + delete *it; + } + mParallelWriteIdReplicas.clear(); + mParallelWriteIdDoneCount = 0; + mParallelWriteIdStatus = 0; + mParallelWriteIdStatusMsg.clear(); + mParallelWriteIdStr.clear(); + mParallelWritePrepReplySupportedFlag = true; + } + void ClearParallelCloseReplicas() + { + for (ParallelCloseReplicas::iterator it = + mParallelCloseReplicas.begin(); + it != mParallelCloseReplicas.end(); + ++it) { + delete *it; + } + mParallelCloseReplicas.clear(); + mParallelCloseDoneCount = 0; + mParallelCloseStatus = 0; + mParallelCloseStatusMsg.clear(); + } void UpdateLeaseExpirationTime() { mLeaseExpireTime = min(mLeaseEndTime, @@ -875,6 +1017,7 @@ class Writer::Impl : mOuter.mStats.mChunkAllocCount++; // Use 5x chunk op timeout for "allocation" that can require // chunk version change. + mAllocateStartUsec = WriterNowUsec(); const int theMetaOpTimeout = mOuter.mMetaServer.GetOpTimeoutSec(); EnqueueMeta(mAllocOp, 0, max(0, max(mOuter.mOpTimeoutSec, 5 * theMetaOpTimeout) - theMetaOpTimeout)); @@ -885,6 +1028,12 @@ class Writer::Impl : IOBuffer* inBufferPtr) { QCASSERT(&mAllocOp == &inOp && ! 
inBufferPtr); + if (0 < mAllocateStartUsec) { + mOuter.mStats.mAllocateUsec += + WriterNowUsec() - mAllocateStartUsec; + mOuter.mStats.mAllocateCount++; + mAllocateStartUsec = 0; + } if (inCanceledFlag) { return; } @@ -952,11 +1101,175 @@ class Writer::Impl : mAllocOp.invalidateAllFlag ); } + bool CanParallelReplicaChunkOps() const + { + return ( + mOuter.mParallelReplicaWriteFlag && + mOuter.mClientPoolPtr && + mAllocOp.chunkServers.size() > 1 && + mAllocOp.chunkServerAccessToken.empty() && + mAllocOp.chunkAccess.empty() + ); + } + void CopyWriteIdAllocRequest( + WriteIdAllocOp& outOp, + const WriteIdAllocOp& inOp) + { + Reset(outOp); + outOp.chunkId = inOp.chunkId; + outOp.fileId = inOp.fileId; + outOp.leaseId = inOp.leaseId; + outOp.chunkVersion = inOp.chunkVersion; + outOp.isForRecordAppend = inOp.isForRecordAppend; + outOp.chunkServerLoc = inOp.chunkServerLoc; + outOp.offset = inOp.offset; + outOp.numBytes = inOp.numBytes; + outOp.writePrepReplySupportedFlag = false; + outOp.noForwardFlag = true; + } + bool TryParallelWriteIdAlloc() + { + if (! 
CanParallelReplicaChunkOps()) { + return false; + } + ClearParallelWriteIdReplicas(); + mParallelWriteIdReplicas.reserve(mAllocOp.chunkServers.size()); + for (vector::const_iterator it = + mAllocOp.chunkServers.begin(); + it != mAllocOp.chunkServers.end(); + ++it) { + ParallelWriteIdReplica* const theReplicaPtr = + new ParallelWriteIdReplica(); + CopyWriteIdAllocRequest(theReplicaPtr->mOp, mWriteIdAllocOp); + theReplicaPtr->mClientPtr = &mOuter.mClientPoolPtr->Get( + *it, mAllocOp.allCSShortRpcFlag); + mParallelWriteIdReplicas.push_back(theReplicaPtr); + } + mLastOpPtr = &mWriteIdAllocOp; + mWriteIdAllocStartUsec = WriterNowUsec(); + for (ParallelWriteIdReplicas::iterator it = + mParallelWriteIdReplicas.begin(); + it != mParallelWriteIdReplicas.end(); + ++it) { + EnqueueParallelWriteId(**it); + } + if (mParallelWriteIdDoneCount >= + (int)mParallelWriteIdReplicas.size()) { + DoneParallelWriteIdAlloc(); + } + return true; + } + void EnqueueParallelWriteId( + ParallelWriteIdReplica& inReplica) + { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "+> parallel " << inReplica.mOp.Show() << + KFS_LOG_EOM; + mOuter.mStats.mChunkOpsQueuedCount++; + if (! 
inReplica.mClientPtr->Enqueue(&inReplica.mOp, this, 0)) { + inReplica.mOp.status = kErrorFault; + inReplica.mDoneFlag = true; + mParallelWriteIdDoneCount++; + if (mParallelWriteIdStatus == 0) { + mParallelWriteIdStatus = kErrorFault; + mParallelWriteIdStatusMsg = + "parallel write id enqueue failure"; + } + } + } + bool DoneParallelWriteIdAlloc( + KfsOp* inOpPtr, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + if (mParallelWriteIdReplicas.empty()) { + return false; + } + for (ParallelWriteIdReplicas::iterator it = + mParallelWriteIdReplicas.begin(); + it != mParallelWriteIdReplicas.end(); + ++it) { + if (&(*it)->mOp == inOpPtr) { + return DoneParallelWriteIdAlloc( + **it, inCanceledFlag, inBufferPtr); + } + } + return false; + } + bool DoneParallelWriteIdAlloc( + ParallelWriteIdReplica& inReplica, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + QCASSERT(! inBufferPtr); + if (inReplica.mDoneFlag) { + return false; + } + inReplica.mDoneFlag = true; + mParallelWriteIdDoneCount++; + if ((inCanceledFlag || inReplica.mOp.status < 0) && + mParallelWriteIdStatus == 0) { + mParallelWriteIdStatus = + inCanceledFlag ? kErrorIo : inReplica.mOp.status; + mParallelWriteIdStatusMsg = + inCanceledFlag ? 
"parallel write id canceled" : + inReplica.mOp.statusMsg; + } else if (inReplica.mOp.status == 0) { + if (mWriteIdAllocOp.chunkAccessResponse.empty()) { + mWriteIdAllocOp.chunkAccessResponse = + inReplica.mOp.chunkAccessResponse; + mWriteIdAllocOp.chunkServerAccessId = + inReplica.mOp.chunkServerAccessId; + mWriteIdAllocOp.chunkServerAccessKey = + inReplica.mOp.chunkServerAccessKey; + mWriteIdAllocOp.accessResponseIssued = + inReplica.mOp.accessResponseIssued; + mWriteIdAllocOp.accessResponseValidForSec = + inReplica.mOp.accessResponseValidForSec; + } + } + if (mParallelWriteIdDoneCount < + (int)mParallelWriteIdReplicas.size()) { + return true; + } + DoneParallelWriteIdAlloc(); + return true; + } + void DoneParallelWriteIdAlloc() + { + if (mLastOpPtr == &mWriteIdAllocOp) { + mLastOpPtr = 0; + } + mWriteIdAllocOp.shortRpcFormatFlag = mAllocOp.allCSShortRpcFlag; + mWriteIdAllocOp.status = mParallelWriteIdStatus; + mWriteIdAllocOp.statusMsg = mParallelWriteIdStatusMsg; + mParallelWriteIdStr.clear(); + mParallelWritePrepReplySupportedFlag = true; + for (ParallelWriteIdReplicas::const_iterator it = + mParallelWriteIdReplicas.begin(); + it != mParallelWriteIdReplicas.end(); + ++it) { + if (! mParallelWriteIdStr.empty()) { + mParallelWriteIdStr.append(" "); + } + mParallelWriteIdStr.append((*it)->mOp.writeIdStr); + mParallelWritePrepReplySupportedFlag = + mParallelWritePrepReplySupportedFlag && + (*it)->mOp.writePrepReplySupportedFlag; + } + mWriteIdAllocOp.writeIdStr = mParallelWriteIdStr; + mWriteIdAllocOp.writePrepReplySupportedFlag = + mParallelWritePrepReplySupportedFlag; + Done(mWriteIdAllocOp, false, 0); + } void AllocateWriteId() { QCASSERT(mAllocOp.chunkId > 0 && ! 
mAllocOp.chunkServers.empty()); Reset(mWriteIdAllocOp); + ClearParallelWriteIdReplicas(); mWriteIdAllocOp.chunkId = mAllocOp.chunkId; + mWriteIdAllocOp.fileId = mOuter.mFileId; + mWriteIdAllocOp.leaseId = mAllocOp.leaseId; mWriteIdAllocOp.chunkVersion = mAllocOp.chunkVersion; mWriteIdAllocOp.isForRecordAppend = false; mWriteIdAllocOp.chunkServerLoc = mAllocOp.chunkServers; @@ -964,22 +1277,34 @@ class Writer::Impl : mWriteIdAllocOp.numBytes = 0; mWriteIdAllocOp.writePrepReplySupportedFlag = false; + const ServerLocation& theMaster = mAllocOp.chunkServers.front(); + if (mOuter.mClientPoolPtr) { + mChunkServerPtr = &mOuter.mClientPoolPtr->Get( + theMaster, mAllocOp.allCSShortRpcFlag); + } else { + mChunkServerPtr = 0; + const ServerLocation theCurLoc = mChunkServer.GetServerLocation(); + if (theCurLoc.IsValid() && theCurLoc != theMaster) { + mChunkServer.Stop(); + } + mChunkServer.SetRpcFormat(mAllocOp.allCSShortRpcFlag ? + ChunkServer::kRpcFormatShort : ChunkServer::kRpcFormatLong); + } + const time_t theNow = Now(); mHasSubjectIdFlag = false; mChunkAccess.clear(); const bool theCSClearTextAllowedFlag = mOuter.IsChunkServerClearTextAllowed(); - mChunkServer.SetShutdownSsl( + GetChunkServer().SetShutdownSsl( mAllocOp.allowCSClearTextFlag && theCSClearTextAllowedFlag ); - mChunkServer.SetRpcFormat(mAllocOp.allCSShortRpcFlag ? - ChunkServer::kRpcFormatShort : ChunkServer::kRpcFormatLong); if (mAllocOp.chunkServerAccessToken.empty() || mAllocOp.chunkAccess.empty()) { - mChunkServer.SetKey(0, 0, 0, 0); - mChunkServer.SetAuthContext(0); + GetChunkServer().SetKey(0, 0, 0, 0); + GetChunkServer().SetAuthContext(0); if (! 
mAllocOp.chunkServerAccessToken.empty()) { mWriteIdAllocOp.status = -EINVAL; mWriteIdAllocOp.statusMsg = "no chunk access"; @@ -995,7 +1320,7 @@ class Writer::Impl : mCSAccessExpireTime = mChunkAccessExpireTime; } } else { - mChunkServer.SetKey( + GetChunkServer().SetKey( mAllocOp.chunkServerAccessToken.data(), mAllocOp.chunkServerAccessToken.size(), mAllocOp.chunkServerAccessKey.GetPtr(), @@ -1019,19 +1344,23 @@ class Writer::Impl : if (mAllocOp.allowCSClearTextFlag && theCSClearTextAllowedFlag && mWriteIdAllocOp.createChunkServerAccessFlag) { - mWriteIdAllocOp.decryptKey = &mChunkServer.GetSessionKey(); + mWriteIdAllocOp.decryptKey = &GetChunkServer().GetSessionKey(); } - if (! mChunkServer.GetAuthContext()) { - mChunkServer.SetAuthContext( + if (! GetChunkServer().GetAuthContext()) { + GetChunkServer().SetAuthContext( mOuter.mMetaServer.GetAuthContext()); } } if (mWriteIdAllocOp.status == 0) { const bool kCancelPendingOpsFlag = true; - if (mChunkServer.SetServer( - mAllocOp.chunkServers[0], + if (mChunkServerPtr || mChunkServer.SetServer( + theMaster, kCancelPendingOpsFlag, &mWriteIdAllocOp.statusMsg)) { + if (TryParallelWriteIdAlloc()) { + return; + } + mWriteIdAllocStartUsec = WriterNowUsec(); Enqueue(mWriteIdAllocOp); return; } @@ -1068,7 +1397,7 @@ class Writer::Impl : } if (0 < inOp.accessResponseValidForSec && ! inOp.chunkServerAccessId.empty()) { - mChunkServer.SetKey( + GetChunkServer().SetKey( inOp.chunkServerAccessId.data(), inOp.chunkServerAccessId.size(), inOp.chunkServerAccessKey.GetPtr(), @@ -1101,8 +1430,8 @@ class Writer::Impl : inOp.subjectId = mWriteIds.front().writeId; } if (inOp.createChunkServerAccessFlag && - mChunkServer.IsShutdownSsl()) { - inOp.decryptKey = &mChunkServer.GetSessionKey(); + GetChunkServer().IsShutdownSsl()) { + inOp.decryptKey = &GetChunkServer().GetSessionKey(); } // Roll forward access time to indicate the request is in flight. 
// If op fails or times out, then write restarts from write id @@ -1114,12 +1443,28 @@ class Writer::Impl : mCSAccessExpireTime = theNow + LEASE_INTERVAL_SECS * 3 / 2; } } + void SetAccess( + ChunkAccessOp& inOp, + const WriteInfo& inWriteInfo, + bool inCanRequestAccessFlag = true) + { + SetAccess(inOp, inCanRequestAccessFlag); + if (inOp.hasSubjectIdFlag) { + inOp.subjectId = inWriteInfo.writeId; + } + } void Done( WriteIdAllocOp& inOp, bool inCanceledFlag, IOBuffer* inBufferPtr) { QCASSERT(&mWriteIdAllocOp == &inOp && ! inBufferPtr); + if (0 < mWriteIdAllocStartUsec) { + mOuter.mStats.mWriteIdAllocUsec += + WriterNowUsec() - mWriteIdAllocStartUsec; + mOuter.mStats.mWriteIdAllocCount++; + mWriteIdAllocStartUsec = 0; + } mWriteIds.clear(); if (inCanceledFlag) { return; @@ -1224,14 +1569,20 @@ class Writer::Impl : inWriteOp.mWritePrepareOp.replyRequestedFlag ); if (inWriteOp.mWritePrepareOp.replyRequestedFlag) { - if (! inWriteOp.mChecksumValidFlag) { + if (inWriteOp.mWritePrepareOp.checksums.empty()) { + inWriteOp.mWritePrepareOp.checksums = ComputeChecksums( + &inWriteOp.mBuffer, + inWriteOp.mWritePrepareOp.numBytes, + &inWriteOp.mWritePrepareOp.checksum + ); + inWriteOp.mChecksumValidFlag = true; + } else if (! 
inWriteOp.mChecksumValidFlag) { inWriteOp.mWritePrepareOp.checksum = ComputeBlockChecksum( &inWriteOp.mBuffer, inWriteOp.mWritePrepareOp.numBytes ); inWriteOp.mChecksumValidFlag = true; } - inWriteOp.mWritePrepareOp.checksums.clear(); } else { if (inWriteOp.mWritePrepareOp.checksums.empty()) { inWriteOp.mWritePrepareOp.checksums = ComputeChecksums( @@ -1256,12 +1607,159 @@ class Writer::Impl : SetAccess(inWriteOp.mWriteSyncOp); } inWriteOp.mOpStartTime = Now(); + inWriteOp.mEnqueueUsec = WriterNowUsec(); Queue::Remove(mPendingQueue, inWriteOp); Queue::PushBack(mInFlightQueue, inWriteOp); mOuter.mStats.mOpsWriteCount++; mOuter.mStats.mOpsWriteByteCount += inWriteOp.contentLength; + if (TryParallelReplicaWrite(inWriteOp)) { + return; + } Enqueue(inWriteOp, &inWriteOp.mBuffer); } + bool CanParallelReplicaWrite() const + { + return ( + mOuter.mParallelReplicaWriteFlag && + mOuter.mClientPoolPtr && + mWriteIdAllocOp.writePrepReplySupportedFlag && + mWriteIds.size() > 1 && + mAllocOp.chunkServerAccessToken.empty() && + mAllocOp.chunkAccess.empty() + ); + } + bool TryParallelReplicaWrite( + WriteOp& inWriteOp) + { + if (! 
CanParallelReplicaWrite()) { + return false; + } + inWriteOp.ClearParallelReplicas(); + inWriteOp.mParallelReplicas.reserve(mWriteIds.size()); + for (WriteIds::const_iterator it = mWriteIds.begin(); + it != mWriteIds.end(); + ++it) { + WriteOp::ParallelReplica* const theReplicaPtr = + new WriteOp::ParallelReplica(); + WritePrepareOp& theOp = theReplicaPtr->mPrepareOp; + Reset(theOp); + theOp.chunkId = inWriteOp.mWritePrepareOp.chunkId; + theOp.chunkVersion = + inWriteOp.mWritePrepareOp.chunkVersion; + theOp.offset = inWriteOp.mWritePrepareOp.offset; + theOp.numBytes = inWriteOp.mWritePrepareOp.numBytes; + theOp.contentLength = inWriteOp.contentLength; + theOp.checksum = inWriteOp.mWritePrepareOp.checksum; + theOp.checksums = inWriteOp.mWritePrepareOp.checksums; + theOp.replyRequestedFlag = true; + theOp.noForwardFlag = true; + theOp.writeInfo = mWriteIds; + SetAccess(theOp, *it, true); + theReplicaPtr->mBuffer.AppendShared(inWriteOp.mBuffer); + theReplicaPtr->mClientPtr = &mOuter.mClientPoolPtr->Get( + it->serverLoc, mAllocOp.allCSShortRpcFlag); + inWriteOp.mParallelReplicas.push_back(theReplicaPtr); + } + for (WriteOp::ParallelReplicas::iterator it = + inWriteOp.mParallelReplicas.begin(); + it != inWriteOp.mParallelReplicas.end(); + ++it) { + EnqueueParallelReplica(inWriteOp, **it); + } + if (inWriteOp.mParallelDoneCount >= + (int)inWriteOp.mParallelReplicas.size()) { + inWriteOp.status = inWriteOp.mParallelStatus; + inWriteOp.statusMsg = inWriteOp.mParallelStatusMsg; + Done(inWriteOp, false, &inWriteOp.mBuffer); + } + return true; + } + void EnqueueParallelReplica( + WriteOp& inWriteOp, + WriteOp::ParallelReplica& inReplica) + { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "+> parallel " << inReplica.mPrepareOp.Show() << + " buffer: " << static_cast(&inReplica.mBuffer) << + "/" << inReplica.mBuffer.BytesConsumable() << + KFS_LOG_EOM; + mOuter.mStats.mChunkOpsQueuedCount++; + if (! 
inReplica.mClientPtr->Enqueue( + &inReplica.mPrepareOp, + this, + &inReplica.mBuffer)) { + inReplica.mPrepareOp.status = kErrorFault; + inReplica.mDoneFlag = true; + inWriteOp.mParallelDoneCount++; + if (inWriteOp.mParallelStatus == 0) { + inWriteOp.mParallelStatus = kErrorFault; + inWriteOp.mParallelStatusMsg = "parallel write enqueue failure"; + } + } + } + bool DoneParallelReplica( + WriteOp& inWriteOp, + WriteOp::ParallelReplica& inReplica, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + if (inReplica.mDoneFlag) { + return false; + } + QCASSERT(inBufferPtr == &inReplica.mBuffer); + inReplica.mDoneFlag = true; + inWriteOp.mParallelDoneCount++; + if ((inCanceledFlag || inReplica.mPrepareOp.status < 0) && + inWriteOp.mParallelStatus == 0) { + inWriteOp.mParallelStatus = + inCanceledFlag ? kErrorIo : + inReplica.mPrepareOp.status; + inWriteOp.mParallelStatusMsg = + inCanceledFlag ? "parallel write canceled" : + inReplica.mPrepareOp.statusMsg; + } else if (inReplica.mPrepareOp.status == 0 && + inWriteOp.mWritePrepareOp.chunkAccessResponse.empty()) { + inWriteOp.mWritePrepareOp.chunkAccessResponse = + inReplica.mPrepareOp.chunkAccessResponse; + inWriteOp.mWritePrepareOp.chunkServerAccessId = + inReplica.mPrepareOp.chunkServerAccessId; + inWriteOp.mWritePrepareOp.chunkServerAccessKey = + inReplica.mPrepareOp.chunkServerAccessKey; + inWriteOp.mWritePrepareOp.accessResponseIssued = + inReplica.mPrepareOp.accessResponseIssued; + inWriteOp.mWritePrepareOp.accessResponseValidForSec = + inReplica.mPrepareOp.accessResponseValidForSec; + } + if (inWriteOp.mParallelDoneCount < + (int)inWriteOp.mParallelReplicas.size()) { + return true; + } + inWriteOp.status = inWriteOp.mParallelStatus; + inWriteOp.statusMsg = inWriteOp.mParallelStatusMsg; + Done(inWriteOp, false, &inWriteOp.mBuffer); + return true; + } + bool DoneParallelReplica( + KfsOp* inOpPtr, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + Queue::Iterator theIt(mInFlightQueue); + WriteOp* theWriteOpPtr; + 
while ((theWriteOpPtr = theIt.Next())) { + for (WriteOp::ParallelReplicas::iterator it = + theWriteOpPtr->mParallelReplicas.begin(); + it != theWriteOpPtr->mParallelReplicas.end(); + ++it) { + if (&(*it)->mPrepareOp == inOpPtr) { + return DoneParallelReplica( + *theWriteOpPtr, **it, + inCanceledFlag, inBufferPtr); + } + } + } + return false; + } void Done( WriteOp& inOp, bool inCanceledFlag, @@ -1282,13 +1780,19 @@ class Writer::Impl : Monitor::ReportError( Monitor::kWriteOpError, mOuter.mMetaServer.GetMetaServerLocation(), - mChunkServer.GetServerLocation(), + GetChunkServer().GetServerLocation(), inOp.status); mOpStartTime = inOp.mOpStartTime; HandleError(inOp); } return; } + if (0 < inOp.mEnqueueUsec) { + mOuter.mStats.mChunkWriteUsec += + WriterNowUsec() - inOp.mEnqueueUsec; + mOuter.mStats.mChunkWriteCount++; + inOp.mEnqueueUsec = 0; + } const Offset theOffset = inOp.mWritePrepareOp.offset; const Offset theDoneCount = inOp.mBuffer.BytesConsumable(); QCASSERT( @@ -1349,10 +1853,121 @@ class Writer::Impl : UpdateLeaseExpirationTime(); StartWrite(); } + bool TryParallelCloseChunk() + { + if (! 
CanParallelReplicaChunkOps() || mCloseOp.chunkVersion < 0 || + mCloseOp.writeInfo.empty()) { + return false; + } + ClearParallelCloseReplicas(); + mParallelCloseReplicas.reserve(mCloseOp.writeInfo.size()); + for (WriteIds::const_iterator it = mCloseOp.writeInfo.begin(); + it != mCloseOp.writeInfo.end(); + ++it) { + ParallelCloseReplica* const theReplicaPtr = + new ParallelCloseReplica(); + CloseOp& theOp = theReplicaPtr->mOp; + Reset(theOp); + theOp.chunkId = mCloseOp.chunkId; + theOp.chunkVersion = mCloseOp.chunkVersion; + theOp.writeInfo = mCloseOp.writeInfo; + theOp.noForwardFlag = true; + SetAccess(theOp, *it, true); + theReplicaPtr->mClientPtr = &mOuter.mClientPoolPtr->Get( + it->serverLoc, mAllocOp.allCSShortRpcFlag); + mParallelCloseReplicas.push_back(theReplicaPtr); + } + mLastOpPtr = &mCloseOp; + mChunkCloseStartUsec = WriterNowUsec(); + for (ParallelCloseReplicas::iterator it = + mParallelCloseReplicas.begin(); + it != mParallelCloseReplicas.end(); + ++it) { + EnqueueParallelClose(**it); + } + if (mParallelCloseDoneCount >= + (int)mParallelCloseReplicas.size()) { + DoneParallelClose(); + } + return true; + } + void EnqueueParallelClose( + ParallelCloseReplica& inReplica) + { + KFS_LOG_STREAM_DEBUG << mLogPrefix << + "+> parallel " << inReplica.mOp.Show() << + KFS_LOG_EOM; + mOuter.mStats.mChunkOpsQueuedCount++; + if (! 
inReplica.mClientPtr->Enqueue(&inReplica.mOp, this, 0)) { + inReplica.mOp.status = kErrorFault; + inReplica.mDoneFlag = true; + mParallelCloseDoneCount++; + if (mParallelCloseStatus == 0) { + mParallelCloseStatus = kErrorFault; + mParallelCloseStatusMsg = + "parallel close enqueue failure"; + } + } + } + bool DoneParallelClose( + KfsOp* inOpPtr, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + if (mParallelCloseReplicas.empty()) { + return false; + } + for (ParallelCloseReplicas::iterator it = + mParallelCloseReplicas.begin(); + it != mParallelCloseReplicas.end(); + ++it) { + if (&(*it)->mOp == inOpPtr) { + return DoneParallelClose( + **it, inCanceledFlag, inBufferPtr); + } + } + return false; + } + bool DoneParallelClose( + ParallelCloseReplica& inReplica, + bool inCanceledFlag, + IOBuffer* inBufferPtr) + { + QCASSERT(! inBufferPtr); + if (inReplica.mDoneFlag) { + return false; + } + inReplica.mDoneFlag = true; + mParallelCloseDoneCount++; + if ((inCanceledFlag || inReplica.mOp.status < 0) && + mParallelCloseStatus == 0) { + mParallelCloseStatus = + inCanceledFlag ? kErrorIo : inReplica.mOp.status; + mParallelCloseStatusMsg = + inCanceledFlag ? 
"parallel close canceled" : + inReplica.mOp.statusMsg; + } + if (mParallelCloseDoneCount < + (int)mParallelCloseReplicas.size()) { + return true; + } + DoneParallelClose(); + return true; + } + void DoneParallelClose() + { + if (mLastOpPtr == &mCloseOp) { + mLastOpPtr = 0; + } + mCloseOp.status = mParallelCloseStatus; + mCloseOp.statusMsg = mParallelCloseStatusMsg; + Done(mCloseOp, false, 0); + } void CloseChunk() { QCASSERT(mAllocOp.chunkId > 0); Reset(mCloseOp); + ClearParallelCloseReplicas(); mCloseOp.chunkId = mAllocOp.chunkId; mCloseOp.chunkVersion = mAllocOp.chunkVersion; mCloseOp.writeInfo = mWriteIds; @@ -1362,6 +1977,9 @@ class Writer::Impl : mCloseOp.chunkServerLoc.clear(); } SetAccess(mCloseOp); + if (TryParallelCloseChunk()) { + return; + } if (mCloseOp.chunkVersion < 0) { // Extend timeout to accommodate object commit, possibly single // atomic 64MB "object" write. @@ -1377,10 +1995,11 @@ class Writer::Impl : " version: " << mCloseOp.chunkVersion << " chunk close timeout: " << theTimeout << " sec." << KFS_LOG_EOM; - mChunkServer.SetOpTimeoutSec(theTimeout); + GetChunkServer().SetOpTimeoutSec(theTimeout); } mWriteIds.clear(); mAllocOp.chunkId = -1; + mChunkCloseStartUsec = WriterNowUsec(); Enqueue(mCloseOp); } void Done( @@ -1389,9 +2008,15 @@ class Writer::Impl : IOBuffer* inBufferPtr) { QCASSERT(&mCloseOp == &inOp && ! inBufferPtr); + if (0 < mChunkCloseStartUsec) { + mOuter.mStats.mChunkCloseUsec += + WriterNowUsec() - mChunkCloseStartUsec; + mOuter.mStats.mChunkCloseCount++; + mChunkCloseStartUsec = 0; + } if (mCloseOp.chunkVersion < 0) { // Restore timeout, changed by CloseChunk(). 
- mChunkServer.SetOpTimeoutSec(mOuter.mOpTimeoutSec); + GetChunkServer().SetOpTimeoutSec(mOuter.mOpTimeoutSec); } if (inCanceledFlag) { return; @@ -1408,7 +2033,10 @@ class Writer::Impl : } mKeepLeaseFlag = false; mCloseOp.chunkId = -1; + const int64_t resetStartUsec = WriterNowUsec(); Reset(); + mOuter.mStats.mChunkResetUsec += WriterNowUsec() - resetStartUsec; + mOuter.mStats.mChunkResetCount++; StartWrite(); } virtual void OpDone( @@ -1449,12 +2077,21 @@ class Writer::Impl : Done(mAllocOp, inCanceledFlag, inBufferPtr); } else if (&mWriteIdAllocOp == inOpPtr) { Done(mWriteIdAllocOp, inCanceledFlag, inBufferPtr); + } else if (DoneParallelWriteIdAlloc( + inOpPtr, inCanceledFlag, inBufferPtr)) { + return; } else if (&mAllocOp == inOpPtr) { Done(mAllocOp, inCanceledFlag, inBufferPtr); } else if (&mCloseOp == inOpPtr) { Done(mCloseOp, inCanceledFlag, inBufferPtr); + } else if (DoneParallelClose( + inOpPtr, inCanceledFlag, inBufferPtr)) { + return; } else if (&mUpdateLeaseOp == inOpPtr) { Done(mUpdateLeaseOp, inCanceledFlag, inBufferPtr); + } else if (DoneParallelReplica( + inOpPtr, inCanceledFlag, inBufferPtr)) { + return; } else if (inOpPtr && inOpPtr->op == CMD_WRITE) { Done(*static_cast(inOpPtr), inCanceledFlag, inBufferPtr); @@ -1462,10 +2099,22 @@ class Writer::Impl : mOuter.InternalError("unexpected operation completion"); } } + void StopChunkServer() + { + if (mChunkServerPtr) { + mChunkServerPtr->CancelAllWithOwner(this); + mChunkServerPtr = 0; + } + mChunkServer.Stop(); + } + ChunkServer& GetChunkServer() + { return (mChunkServerPtr ? *mChunkServerPtr : mChunkServer); } + const ChunkServer& GetChunkServer() const + { return (mChunkServerPtr ? 
*mChunkServerPtr : mChunkServer); } void Enqueue( KfsOp& inOp, IOBuffer* inBufferPtr = 0) - { EnqueueSelf(inOp, inBufferPtr, &mChunkServer, 0); } + { EnqueueSelf(inOp, inBufferPtr, &GetChunkServer(), 0); } void EnqueueMeta( KfsOp& inOp, IOBuffer* inBufferPtr = 0, @@ -1480,7 +2129,9 @@ class Writer::Impl : mWriteIds.clear(); mAllocOp.chunkId = 0; mLastOpPtr = 0; - mChunkServer.Stop(); + StopChunkServer(); + ClearParallelWriteIdReplicas(); + ClearParallelCloseReplicas(); QCASSERT(Queue::IsEmpty(mInFlightQueue)); if (mSleepingFlag) { mSleepTimer.RemoveTimeout(); @@ -1532,9 +2183,9 @@ class Writer::Impl : " status: " << inOp.status << " msg: " << inOp.statusMsg << " op: " << inOp.Show() << - " current chunk server: " << mChunkServer.GetServerLocation() << - " chunkserver: " << (mChunkServer.IsDataSent() ? - (mChunkServer.IsAllDataSent() ? "all" : "partial") : + " current chunk server: " << GetChunkServer().GetServerLocation() << + " chunkserver: " << (GetChunkServer().IsDataSent() ? + (GetChunkServer().IsAllDataSent() ? 
"all" : "partial") : "no") << " data sent" << " retry: " << mRetryCount << "\nRequest:\n" << theOStream.str() << @@ -1757,6 +2408,8 @@ class Writer::Impl : Offset mOffset; Offset mOpenChunkBlockSize; int64_t mChunkServerInitialSeqNum; + ClientPool* const mClientPoolPtr; + const bool mParallelReplicaWriteFlag; Completion* mCompletionPtr; IOBuffer mBuffer; string const mLogPrefix; @@ -1768,6 +2421,8 @@ class Writer::Impl : int mCompletionDepthCount; int mStriperProcessCount; Striper* mStriperPtr; + int64_t mCloseStartUsec; + int64_t mSetSizeStartUsec; ChunkWriter* mWriters[1]; void InternalError( @@ -1872,6 +2527,7 @@ class Writer::Impl : mTruncateOp.fid = mFileId; mTruncateOp.fileOffset = theSize; mTruncateOp.status = 0; + mSetSizeStartUsec = WriterNowUsec(); KFS_LOG_STREAM_DEBUG << mLogPrefix << "meta +> " << mTruncateOp.Show() << KFS_LOG_EOM; @@ -1896,6 +2552,11 @@ class Writer::Impl : if (inOpPtr != &mTruncateOp) { return; } + if (0 < mSetSizeStartUsec) { + mStats.mSetSizeUsec += WriterNowUsec() - mSetSizeStartUsec; + mStats.mSetSizeCount++; + mSetSizeStartUsec = 0; + } mTruncateOp.pathname = 0; mTruncateOp.fid = -1; if (inCanceledFlag) { @@ -2121,6 +2782,11 @@ class Writer::Impl : if (mClosingFlag && Writers::IsEmpty(mWriters) && ! mSleepingFlag) { SetFileSize(); if (mTruncateOp.fid < 0 && ! mSleepingFlag) { + if (0 < mCloseStartUsec) { + mStats.mCloseUsec += WriterNowUsec() - mCloseStartUsec; + mStats.mCloseCount++; + mCloseStartUsec = 0; + } mClosingFlag = false; mFileId = -1; Striper* const theStriperPtr = mStriperPtr; @@ -2214,7 +2880,9 @@ Writer::Writer( int inIdleTimeoutSec, int inMaxWriteSize, const char* inLogPrefixPtr, - int64_t inChunkServerInitialSeqNum) + int64_t inChunkServerInitialSeqNum, + ClientPool* inClientPoolPtr, + bool inParallelReplicaWriteFlag) : mImpl(*new Writer::Impl( *this, inMetaServer, @@ -2228,7 +2896,9 @@ Writer::Writer( inMaxWriteSize, (inLogPrefixPtr && inLogPrefixPtr[0]) ? 
(inLogPrefixPtr + string(" ")) : string(), - inChunkServerInitialSeqNum + inChunkServerInitialSeqNum, + inClientPoolPtr, + inParallelReplicaWriteFlag )) { mImpl.Ref(); diff --git a/src/cc/libclient/Writer.h b/src/cc/libclient/Writer.h index 10d6b2a19..aef754650 100644 --- a/src/cc/libclient/Writer.h +++ b/src/cc/libclient/Writer.h @@ -40,6 +40,8 @@ namespace client { using std::string; +class ClientPool; + // Kfs client write protocol state machine. class Writer { @@ -85,7 +87,21 @@ class Writer mRetriesCount(0), mWriteCount(0), mWriteByteCount(0), - mBufferCompactionCount(0) + mBufferCompactionCount(0), + mCloseCount(0), + mCloseUsec(0), + mSetSizeCount(0), + mSetSizeUsec(0), + mChunkCloseCount(0), + mChunkCloseUsec(0), + mChunkWriteCount(0), + mChunkWriteUsec(0), + mChunkResetCount(0), + mChunkResetUsec(0), + mWriteIdAllocCount(0), + mWriteIdAllocUsec(0), + mAllocateCount(0), + mAllocateUsec(0) {} void Clear() { *this = Stats(); } @@ -104,6 +120,20 @@ class Writer mWriteCount += inStats.mWriteCount; mWriteByteCount += inStats.mWriteByteCount; mBufferCompactionCount += inStats.mBufferCompactionCount; + mCloseCount += inStats.mCloseCount; + mCloseUsec += inStats.mCloseUsec; + mSetSizeCount += inStats.mSetSizeCount; + mSetSizeUsec += inStats.mSetSizeUsec; + mChunkCloseCount += inStats.mChunkCloseCount; + mChunkCloseUsec += inStats.mChunkCloseUsec; + mChunkWriteCount += inStats.mChunkWriteCount; + mChunkWriteUsec += inStats.mChunkWriteUsec; + mChunkResetCount += inStats.mChunkResetCount; + mChunkResetUsec += inStats.mChunkResetUsec; + mWriteIdAllocCount += inStats.mWriteIdAllocCount; + mWriteIdAllocUsec += inStats.mWriteIdAllocUsec; + mAllocateCount += inStats.mAllocateCount; + mAllocateUsec += inStats.mAllocateUsec; return *this; } template @@ -122,6 +152,20 @@ class Writer inFunctor("Retries", mRetriesCount); inFunctor("Writes" , mWriteCount); inFunctor("WriteBytes", mWriteByteCount); + inFunctor("CloseCount", mCloseCount); + inFunctor("CloseUsec", mCloseUsec); + 
inFunctor("SetSizeCount", mSetSizeCount); + inFunctor("SetSizeUsec", mSetSizeUsec); + inFunctor("ChunkCloseCount", mChunkCloseCount); + inFunctor("ChunkCloseUsec", mChunkCloseUsec); + inFunctor("ChunkWriteCount", mChunkWriteCount); + inFunctor("ChunkWriteUsec", mChunkWriteUsec); + inFunctor("ChunkResetCount", mChunkResetCount); + inFunctor("ChunkResetUsec", mChunkResetUsec); + inFunctor("WriteIdAllocCount", mWriteIdAllocCount); + inFunctor("WriteIdAllocUsec", mWriteIdAllocUsec); + inFunctor("AllocateCount", mAllocateCount); + inFunctor("AllocateUsec", mAllocateUsec); } Counter mMetaOpsQueuedCount; Counter mMetaOpsCancelledCount; @@ -135,6 +179,20 @@ class Writer Counter mWriteCount; Counter mWriteByteCount; Counter mBufferCompactionCount; + Counter mCloseCount; + Counter mCloseUsec; + Counter mSetSizeCount; + Counter mSetSizeUsec; + Counter mChunkCloseCount; + Counter mChunkCloseUsec; + Counter mChunkWriteCount; + Counter mChunkWriteUsec; + Counter mChunkResetCount; + Counter mChunkResetUsec; + Counter mWriteIdAllocCount; + Counter mWriteIdAllocUsec; + Counter mAllocateCount; + Counter mAllocateUsec; }; class Striper { @@ -200,7 +258,9 @@ class Writer int inIdleTimeoutSec, int inMaxWriteSize, const char* inLogPrefixPtr, - int64_t inChunkServerInitialSeqNum); + int64_t inChunkServerInitialSeqNum, + ClientPool* inClientPoolPtr = 0, + bool inParallelReplicaWriteFlag = false); virtual ~Writer(); int Open( kfsFileId_t inFileId, diff --git a/src/cc/meta/CMakeLists.txt b/src/cc/meta/CMakeLists.txt index 734cb4405..77524ab57 100644 --- a/src/cc/meta/CMakeLists.txt +++ b/src/cc/meta/CMakeLists.txt @@ -35,9 +35,11 @@ set (lib_srcs kfsops.cc kfstree.cc LayoutManager.cc + layoutmanager_instance.cc meta.cc MetaRequest.cc NetDispatch.cc + NamespaceV2.cc Replay.cc Restorer.cc util.cc @@ -62,7 +64,7 @@ target_link_libraries(kfsMeta ) if (NOT USE_STATIC_LIB_LINKAGE) - add_library (kfsMeta-shared SHARED ${lib_srcs} layoutmanager_instance.cc) + add_library (kfsMeta-shared SHARED 
${lib_srcs}) set_target_properties (kfsMeta-shared PROPERTIES OUTPUT_NAME "qfs_meta") set_target_properties (kfsMeta-shared PROPERTIES CLEAN_DIRECT_OUTPUT 1) target_link_libraries(kfsMeta-shared @@ -114,6 +116,21 @@ if (CMAKE_SYSTEM_NAME STREQUAL "SunOS") target_link_libraries(kfsMeta umem) endif (CMAKE_SYSTEM_NAME STREQUAL "SunOS") +add_executable (namespacev2test namespacev2test_main.cc NamespaceV2.cc) +target_link_libraries(namespacev2test kfsCommon) +add_dependencies(namespacev2test kfsCommon) +qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/devtools" namespacev2test) + +add_executable (namespacev2bench namespacev2bench_main.cc NamespaceV2.cc) +target_link_libraries(namespacev2bench kfsCommon) +add_dependencies(namespacev2bench kfsCommon) +qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/devtools" namespacev2bench) + +add_executable (namespacev2walreplaytest namespacev2walreplaytest_main.cc) +target_link_libraries(namespacev2walreplaytest kfsMeta) +add_dependencies(namespacev2walreplaytest kfsMeta) +qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/devtools" namespacev2walreplaytest) + if (CYGWIN) # Workaround for "too many sections" asm failure by turning on optimization # for compiler to inline for all build types. diff --git a/src/cc/meta/Checkpoint.cc b/src/cc/meta/Checkpoint.cc index c28c811ea..60331d006 100644 --- a/src/cc/meta/Checkpoint.cc +++ b/src/cc/meta/Checkpoint.cc @@ -41,6 +41,7 @@ #include "MetaVrSM.h" #include "MetaVrLogSeq.h" #include "util.h" +#include "NamespaceV2.h" #include "common/MdStream.h" #include "common/FdWriter.h" @@ -169,6 +170,9 @@ Checkpoint::write( if (status == 0 && os) { status = gNetDispatch.CheckpointCryptoKeys(os); } + if (status == 0 && os && NamespaceV2::GetConfig().enabledFlag) { + status = NamespaceV2::GetStore().SaveCheckpointDiskEntry(os); + } if (status == 0) { os << "worm/" << (getWORMMode() ? 
1 : 0) << '\n'; os << "time/" << DisplayIsoDateTime() << '\n'; diff --git a/src/cc/meta/ChunkServer.cc b/src/cc/meta/ChunkServer.cc index 7e701eed4..8c13aa51e 100644 --- a/src/cc/meta/ChunkServer.cc +++ b/src/cc/meta/ChunkServer.cc @@ -290,6 +290,7 @@ int ChunkServer::sMakeStableTimeout = 330; int ChunkServer::sReplicationTimeout = 510; int ChunkServer::sRequestTimeout = 600; int ChunkServer::sMetaClientPort = 0; +bool ChunkServer::sSkipChunkAllocateInFlightLogFlag = false; int ChunkServer::sTimedoutExpireTime = 10; size_t ChunkServer::sMaxChunksToEvacuate = 2 << 10; // Max queue size // sHeartbeatInterval * sSrvLoadSamplerSampleCount -- boxcar FIR filter @@ -399,6 +400,9 @@ void ChunkServer::SetParameters(const Properties& prop, int clientPort) sMaxPendingOpsCount = max(8, prop.getValue( "metaServer.chunkServer.maxPendingOpsCount", sMaxPendingOpsCount)); + sSkipChunkAllocateInFlightLogFlag = prop.getValue( + "metaServer.chunkServer.skipChunkAllocateInFlightLog", + sSkipChunkAllocateInFlightLogFlag ? 1 : 0) != 0; if (clientPort > 0) { sMetaClientPort = clientPort; } @@ -2203,6 +2207,30 @@ ChunkServer::HandleReply(IOBuffer* iobuf, int msgLen) op->status = -KfsToSysErrno(-op->status); } op->handleReply(prop); + if (op->op == META_CHUNK_ALLOCATE) { + const int64_t nowUsec = microseconds(); + const int64_t submitUsec = op->submitTime; + const int64_t elapsedUsec = submitUsec > 0 ? + nowUsec - submitUsec : 0; + const MetaChunkAllocate* const allocOp = + static_cast(op); + const int64_t allocWaitUsec = allocOp->req && + allocOp->req->debugAfterLayoutUsec > 0 ? 
+ nowUsec - allocOp->req->debugAfterLayoutUsec : 0; + if (100000 <= allocWaitUsec || 100000 <= elapsedUsec) { + KFS_LOG_STREAM_INFO << GetServerLocation() << + " meta-chunk-allocate reply timing:" + " seq: " << op->opSeqno << + " chunk: " << op->chunkId << + " status: " << op->status << + " submit-to-reply-usec: " << elapsedUsec << + " layout-to-reply-usec: " << allocWaitUsec << + " process-usec: " << op->processTime << + " msg-len: " << msgLen << + " recursion: " << mRecursionCount << + KFS_LOG_EOM; + } + } KFS_LOG_STREAM_DEBUG << GetServerLocation() << " cs-reply:" " -seq: " << op->opSeqno << @@ -2663,7 +2691,9 @@ ChunkServer::Enqueue(MetaChunkRequest& req, req.inFlightIt = sChunkOpsInFlight.insert( make_pair(chunkIdInFlight, &req)); } - if (! req.replayFlag) { + if (! req.replayFlag && + (! sSkipChunkAllocateInFlightLogFlag || + META_CHUNK_ALLOCATE != req.op)) { mLogInFlightCount++; if (MetaChunkLogInFlight::Log(req, timeout, removeReplicaFlag)) { return; diff --git a/src/cc/meta/ChunkServer.h b/src/cc/meta/ChunkServer.h index 9abd51380..a5dd5d80c 100644 --- a/src/cc/meta/ChunkServer.h +++ b/src/cc/meta/ChunkServer.h @@ -1060,6 +1060,7 @@ class ChunkServer : static int sReplicationTimeout; static int sRequestTimeout; static int sMetaClientPort; + static bool sSkipChunkAllocateInFlightLogFlag; static bool sRestartCSOnInvalidClusterKeyFlag; static int sSrvLoadSamplerSampleCount; static size_t sMaxChunksToEvacuate; diff --git a/src/cc/meta/ClientManager.h b/src/cc/meta/ClientManager.h index 54fbd390c..1d8043dc1 100644 --- a/src/cc/meta/ClientManager.h +++ b/src/cc/meta/ClientManager.h @@ -71,6 +71,10 @@ class ClientManager } return EnqueueSelf(thread, op); } + void EnqueueBatch( + ClientThread* thread, + MetaRequest* const* reqs, + size_t count); static void SubmitRequest(ClientThread* thread, MetaRequest& op) { if (thread) { diff --git a/src/cc/meta/ClientSM.h b/src/cc/meta/ClientSM.h index ce040ecf6..a4eba9356 100644 --- a/src/cc/meta/ClientSM.h +++ 
b/src/cc/meta/ClientSM.h @@ -105,6 +105,8 @@ class ClientSM : bool Handle(MetaAllocate& op); int& GetLogQueueCounter() { return mLogQueueCounter; } + ClientManager::ClientThread* GetClientThread() const + { return mClientThread; } private: /// A handle to a network connection NetConnectionPtr mNetConnection; diff --git a/src/cc/meta/LayoutManager.cc b/src/cc/meta/LayoutManager.cc index 130746c14..aee36b7bd 100644 --- a/src/cc/meta/LayoutManager.cc +++ b/src/cc/meta/LayoutManager.cc @@ -32,6 +32,7 @@ #include "ClientSM.h" #include "NetDispatch.h" #include "LogWriter.h" +#include "NamespaceV2.h" #include "qcdio/QCIoBufferPool.h" #include "qcdio/QCUtils.h" @@ -2059,6 +2060,7 @@ LayoutManager::LayoutManager() mConcurrentWritesPerNodeWatermark(10), mMaxSpaceUtilizationThreshold(0.95), mUseFsTotalSpaceFlag(true), + mHdfsLikeAllocateFlag(false), mChunkAllocMinAvailSpace(2 * (int64_t)CHUNKSIZE), mCompleteReplicationCheckInterval(30 * kSecs2MicroSecs), mCompleteReplicationCheckTime( @@ -2392,6 +2394,9 @@ LayoutManager::SetParameters(const Properties& props, int clientPort) mUseFsTotalSpaceFlag = props.getValue( "metaServer.useFsTotalSpace", mUseFsTotalSpaceFlag ? 1 : 0) != 0; + mHdfsLikeAllocateFlag = props.getValue( + "metaServer.writeFlow.hdfsLikeAllocate", + mHdfsLikeAllocateFlag ? 1 : 0) != 0; mChunkAllocMinAvailSpace = props.getValue( "metaServer.chunkAllocMinAvailSpace", mChunkAllocMinAvailSpace); @@ -2483,6 +2488,7 @@ LayoutManager::SetParameters(const Properties& props, int clientPort) mChunkReplicator.GetTimeoutInterval() * 1e-3) * 1e3)); mCheckpoint.GetOp().SetParameters(props); + NamespaceV2::SetParameters(props); mCSCountersUpdateInterval = props.getValue( "metaServer.CSCountersUpdateInterval", @@ -5034,6 +5040,17 @@ LayoutManager::AddNotStableChunk( return "chunk was open for append"; } const seq_t curChunkVersion = pinfo.GetChunkInfo()->chunkVersion; + if (mHdfsLikeAllocateFlag && ! 
appendFlag && + chunkVersion == 0 && curChunkVersion > 0) { + KFS_LOG_STREAM_INFO << logPrefix << + " not stable chunk:" + " <" << fileId << + "," << chunkId << ">" << + " remapping dirty version 0 to current version " << + curChunkVersion << + KFS_LOG_EOM; + chunkVersion = curChunkVersion; + } if (chunkVersion < curChunkVersion) { return "lower chunk version"; } @@ -7158,6 +7175,19 @@ LayoutManager::AllocateChunk( if (req.appendChunk) { mARAChunkCache.RequestNew(req); } + if (mHdfsLikeAllocateFlag && ! req.appendChunk && + ! req.stripedFileFlag && 0 < req.numReplicas && + 0 <= req.chunkVersion) { + KFS_LOG_STREAM_DEBUG << + "hdfs-like allocate: deferred chunk create" + " fid: " << req.fid << + " chunk: " << req.chunkId << + " version: " << req.chunkVersion << + " replicas: " << req.servers.size() << + KFS_LOG_EOM; + req.LayoutDone(0); + return 0; + } for (size_t i = req.servers.size(); i-- > 0; ) { req.servers[i]->AllocateChunk(req, i == 0 ? req.leaseId : -1, tiers[i]); } @@ -10124,7 +10154,7 @@ LayoutManager::MakeChunkStableInit( KFS_LOG_EOM; return; } - KFS_LOG_STREAM_INFO << logPrefix << + KFS_LOG_STREAM_DEBUG << logPrefix << " <" << fid << "," << chunkId << ">" " name: " << pathname << " version: " << chunkVersion << @@ -10486,7 +10516,7 @@ LayoutManager::LogMakeChunkStableDone(MetaLogMakeChunkStable& req) info.serverAddedFlag = false; info.chunkSize = req.chunkSize; info.chunkChecksum = req.chunkChecksum; - KFS_LOG_STREAM_INFO << logPrefix << + KFS_LOG_STREAM_DEBUG << logPrefix << " <" << req.fid << "," << req.chunkId << ">" " starting MCS" " version: " << req.chunkVersion << @@ -10514,6 +10544,86 @@ LayoutManager::LogMakeChunkStableDone(MetaLogMakeChunkStable& req) )); } +bool +LayoutManager::ScheduleTruncateToLastRecoverableChunk( + fid_t fid, + chunkId_t chunkId, + chunkOff_t chunkSize) +{ + if (! mHdfsLikeAllocateFlag || ! 
mPrimaryFlag || fid < 0) { + return false; + } + StTmp > cinfoTmp(mChunkInfosTmp); + vector& chunks = cinfoTmp.Get(); + MetaFattr* fa = 0; + const int status = metatree.getalloc(fid, fa, chunks, 0); + if (status != 0 || ! fa || KFS_FILE != fa->type || fa->IsSymLink() || + fa->IsStriped() || fa->numReplicas <= 0 || chunks.empty()) { + return false; + } + bool sawUnrecoverableSuffixFlag = false; + chunkOff_t truncateOffset = -1; + chunkId_t lastRecoverableChunkId = -1; + StTmp serversTmp(mServers3Tmp); + Servers& servers = serversTmp.Get(); + for (vector::const_reverse_iterator it = chunks.rbegin(); + it != chunks.rend(); + ++it) { + MetaChunkInfo* const ci = *it; + CSMap::Entry* const entry = mChunkToServerMap.Find(ci->chunkId); + bool recoverableFlag = false; + if (entry) { + servers.clear(); + mChunkToServerMap.GetServers(*entry, servers); + for (Servers::const_iterator si = servers.begin(); + si != servers.end(); + ++si) { + if ((*si)->IsConnected()) { + recoverableFlag = true; + break; + } + } + } + if (! recoverableFlag) { + sawUnrecoverableSuffixFlag = true; + continue; + } + if (! sawUnrecoverableSuffixFlag) { + return false; + } + truncateOffset = ci->offset + (chunkOff_t)CHUNKSIZE; + if (ci->chunkId == chunkId && 0 <= chunkSize && + chunkSize < (chunkOff_t)CHUNKSIZE) { + truncateOffset = ci->offset + chunkSize; + } + lastRecoverableChunkId = ci->chunkId; + break; + } + if (! 
sawUnrecoverableSuffixFlag) { + return false; + } + if (truncateOffset < 0) { + truncateOffset = 0; + } + if (fa->nextChunkOffset() <= truncateOffset) { + return false; + } + MetaTruncate& op = *(new MetaTruncate()); + op.fid = fid; + op.offset = truncateOffset; + op.setEofHintFlag = true; + KFS_LOG_STREAM_INFO << + "scheduling hdfs-like recovery truncate:" + " fid: " << fid << + " offset: " << truncateOffset << + " next: " << fa->nextChunkOffset() << + " trigger: " << chunkId << + " recoverable: " << lastRecoverableChunkId << + KFS_LOG_EOM; + submit_request(&op); + return true; +} + void LayoutManager::MakeChunkStableDone(const MetaChunkMakeStable& req) { @@ -10704,7 +10814,7 @@ LayoutManager::MakeChunkStableDone(const MetaChunkMakeStable& req) CancelPendingMakeStable(fileId, req.chunkId); } } - KFS_LOG_STREAM_INFO << logPrefix << + KFS_LOG_STREAM_DEBUG << logPrefix << " <" << req.fid << "," << req.chunkId << ">" " fid: " << fileId << " version: " << req.chunkVersion << @@ -10716,6 +10826,16 @@ LayoutManager::MakeChunkStableDone(const MetaChunkMakeStable& req) " down: " << numDownServers << " server(s)" << KFS_LOG_EOM; + if (updateSizeFlag && + numServers > 0 && + fa->filesize < 0 && + ! fa->IsStriped() && + pinfo->GetChunkInfo()->offset + + (chunkOff_t)CHUNKSIZE < fa->nextChunkOffset() && + ScheduleTruncateToLastRecoverableChunk( + fileId, req.chunkId, req.chunkSize)) { + return; + } if (! 
updateSizeFlag || numServers <= 0 || fa->filesize >= 0 || diff --git a/src/cc/meta/LayoutManager.h b/src/cc/meta/LayoutManager.h index 2da315e69..e04e878ae 100644 --- a/src/cc/meta/LayoutManager.h +++ b/src/cc/meta/LayoutManager.h @@ -1110,6 +1110,10 @@ class LayoutManager : public ITimeout void BeginMakeChunkStableDone(const MetaBeginMakeChunkStable& req); void LogMakeChunkStableDone(MetaLogMakeChunkStable& req); void MakeChunkStableDone(const MetaChunkMakeStable& req); + bool ScheduleTruncateToLastRecoverableChunk( + fid_t fid, + chunkId_t chunkId, + chunkOff_t chunkSize); void Handle(MetaLogMakeChunkStableDone& req); void ReplayPendingMakeStable( chunkId_t chunkId, @@ -2369,6 +2373,7 @@ class LayoutManager : public ITimeout double mMaxSpaceUtilizationThreshold; bool mUseFsTotalSpaceFlag; + bool mHdfsLikeAllocateFlag; int64_t mChunkAllocMinAvailSpace; int64_t mCompleteReplicationCheckInterval; diff --git a/src/cc/meta/LogWriter.cc b/src/cc/meta/LogWriter.cc index f3899374c..4f0a72c42 100644 --- a/src/cc/meta/LogWriter.cc +++ b/src/cc/meta/LogWriter.cc @@ -28,6 +28,7 @@ #include "LogWriter.h" #include "LogTransmitter.h" #include "MetaRequest.h" +#include "NetDispatch.h" #include "MetaDataStore.h" #include "MetaVrSM.h" #include "MetaVrLogSeq.h" @@ -47,6 +48,7 @@ #include "kfsio/NetManager.h" #include "kfsio/ITimeout.h" #include "kfsio/checksum.h" +#include "kfsio/Base64.h" #include "kfsio/PrngIsaac64.h" #include "kfsio/NetErrorSimulator.h" #include "kfsio/NetManagerWatcher.h" @@ -1322,6 +1324,236 @@ class LogWriter::Impl : mLogAvgUsecsNextTimeUsec += kLogAvgIntervalUsec; } } + void SubmitDoneRequest( + MetaRequest& inReq, + int64_t inStartTime, + bool& ioFirstItemFlag) + { + const int64_t theUsecsNow = ioFirstItemFlag ? 
+ inStartTime : microseconds(); + ioFirstItemFlag = false; + if (META_LOG_WRITER_CONTROL != inReq.op) { + if (0 == inReq.status) { + mLogTimeUsec += inStartTime - inReq.submitTime; + mLogTimeOpsCount++; + } else { + mLogErrorOpsCount++; + } + } + inReq.Submit(theUsecsNow); + } + void SubmitDoneBatch( + vector& inBatch, + int64_t inStartTime, + bool& ioFirstItemFlag) + { + if (inBatch.empty()) { + return; + } + const int64_t theUsecsNow = ioFirstItemFlag ? + inStartTime : microseconds(); + ioFirstItemFlag = false; + for (vector::iterator it = inBatch.begin(); + it != inBatch.end(); + ++it) { + MetaRequest& req = **it; + if (META_LOG_WRITER_CONTROL != req.op) { + if (0 == req.status) { + mLogTimeUsec += inStartTime - req.submitTime; + mLogTimeOpsCount++; + } else { + mLogErrorOpsCount++; + } + } + (void)theUsecsNow; + if (req.commitPendingFlag) { + RequestCommitted(req, fileID.getseed()); + } + } + gNetDispatch.DispatchBatch( + &inBatch.front(), inBatch.size()); + } + bool IsNamespaceV2CreateIdsLoggable( + const MetaRequest& inReq) const + { + return inReq.NeedsNamespaceV2CreateIds() && + ((MetaRequest::kLogIfOk == inReq.logAction && + 0 == inReq.status) || + MetaRequest::kLogAlways == inReq.logAction); + } + void ReserveNamespaceV2CreateIdsBatch( + MetaRequest& inReq) + { + if (! IsNamespaceV2CreateIdsLoggable(inReq)) { + return; + } + size_t count = 0; + for (MetaRequest* ptr = &inReq; ptr && + count < (size_t)mMaxBlockSize && + IsNamespaceV2CreateIdsLoggable(*ptr); + ptr = ptr->next) { + ++count; + } + fid_t firstFid = -1; + uint64_t firstTxnId = 0; + MetaRequest::ReserveNamespaceV2CreateIdsBatch( + count, firstFid, firstTxnId); + for (MetaRequest* ptr = &inReq; count > 0; + ptr = ptr->next, --count, ++firstFid, ++firstTxnId) { + if (! ptr || ! 
ptr->SetNamespaceV2CreateIds( + firstFid, firstTxnId)) { + panic("namespace v2 create id batch reserve failed"); + } + } + } + void FlushNamespaceV2Batch( + vector& inBatch, + int64_t inStartTime, + bool& ioFirstItemFlag) + { + if (inBatch.empty()) { + return; + } + uint64_t firstTxnId = 0; + uint64_t lastTxnId = 0; + for (vector::iterator it = inBatch.begin(); + it != inBatch.end(); + ++it) { + MetaRequest& req = **it; + req.ApplyNamespaceV2Batch(false); + if (firstTxnId == 0) { + firstTxnId = req.GetNamespaceV2BatchTxnId(); + } + lastTxnId = req.GetNamespaceV2BatchTxnId(); + } + MetaRequest::CommitNamespaceV2Batch(firstTxnId, lastTxnId); + SubmitDoneBatch(inBatch, inStartTime, ioFirstItemFlag); + inBatch.clear(); + } + + enum { kNamespaceV2WalBatchMaxCount = 64 }; + + bool IsNamespaceV2WalBatchable( + const MetaRequest& inReq) const + { + // Only batch create / mkdir with pre-reserved namespace v2 ids. + // Other ops keep the per-record format. + return IsNamespaceV2CreateIdsLoggable(inReq) && + (META_CREATE == inReq.op || META_MKDIR == inReq.op); + } + + template + static void AppendLe(std::string& out, T v) + { + for (size_t i = 0; i < sizeof(T); i++) { + out.push_back((char)((uint64_t)v >> (i * 8))); + } + } + + static void AppendBytes(std::string& out, const char* data, size_t len) + { + out.append(data, len); + } + + bool WriteNamespaceV2WalBatchRecord( + ostream& os, + const vector& batch) const + { + if (batch.empty()) { + return true; + } + // Record format: nv2batch/c//b/ + // The payload is raw bytes in little-endian encoding, base64 encoded. 
+ std::string payload; + payload.reserve(batch.size() * 64); + for (vector::const_iterator it = batch.begin(); + it != batch.end(); + ++it) { + const MetaRequest& req = **it; + if (META_CREATE == req.op) { + const MetaCreate& c = static_cast(req); + AppendLe(payload, 1); // create file + AppendLe(payload, (int64_t)c.dir); + AppendLe(payload, (int64_t)c.fid); + AppendLe(payload, c.namespaceV2TxnId); + AppendLe(payload, (uint32_t)c.user); + AppendLe(payload, (uint32_t)c.group); + AppendLe(payload, (uint16_t)c.mode); + AppendLe(payload, c.numReplicas); + AppendLe(payload, c.mtime); + const uint16_t nlen = (uint16_t)std::min( + 0xFFFFu, c.name.size()); + AppendLe(payload, nlen); + AppendBytes(payload, c.name.data(), nlen); + } else if (META_MKDIR == req.op) { + const MetaMkdir& m = static_cast(req); + AppendLe(payload, 2); // mkdir dir + AppendLe(payload, (int64_t)m.dir); + AppendLe(payload, (int64_t)m.fid); + AppendLe(payload, m.namespaceV2TxnId); + AppendLe(payload, (uint32_t)m.user); + AppendLe(payload, (uint32_t)m.group); + AppendLe(payload, (uint16_t)m.mode); + AppendLe(payload, (int16_t)0); + AppendLe(payload, m.mtime); + const uint16_t nlen = (uint16_t)std::min( + 0xFFFFu, m.name.size()); + AppendLe(payload, nlen); + AppendBytes(payload, m.name.data(), nlen); + } else { + return false; + } + } + StBufferT b64Buf; + char* const bufPtr = b64Buf.Resize( + Base64::GetEncodedMaxBufSize((int)payload.size())); + const int b64Len = Base64::Encode( + payload.data(), (int)payload.size(), bufPtr, true); + if (b64Len <= 0) { + return false; + } + // Bound the record length to avoid exceeding block bytes. + const size_t kOverhead = 64; + if ((size_t)b64Len + kOverhead > (size_t)std::max(0, mMaxBlockBytes)) { + return false; + } + os << "nv2batch/c/" << batch.size() << "/b/" << + std::string(bufPtr, b64Len) << "\n"; + return bool(os); + } + + bool WriteNamespaceV2WalBatchContRecord( + ostream& os) const + { + // Placeholder record to preserve per-op log sequence numbering. 
+ os << "nv2batchc\n"; + return bool(os); + } + void SubmitDoneOrBatch( + MetaRequest& inReq, + vector& ioBatch, + uint64_t& ioNextBatchTxnId, + int64_t inStartTime, + bool& ioFirstItemFlag) + { + const bool batchFlag = inReq.CanBatchApplyNamespaceV2(); + const uint64_t txnId = batchFlag ? + inReq.GetNamespaceV2BatchTxnId() : uint64_t(0); + if (batchFlag && + (ioBatch.empty() || txnId == ioNextBatchTxnId)) { + ioBatch.push_back(&inReq); + ioNextBatchTxnId = txnId + 1; + return; + } + FlushNamespaceV2Batch(ioBatch, inStartTime, ioFirstItemFlag); + ioNextBatchTxnId = 0; + if (batchFlag) { + ioBatch.push_back(&inReq); + ioNextBatchTxnId = txnId + 1; + } else { + SubmitDoneRequest(inReq, inStartTime, ioFirstItemFlag); + } + } virtual void Timeout() { mDebugHistoryCommittedRing.Process( @@ -1354,6 +1586,9 @@ class LogWriter::Impl : mNetManager.Wakeup(); } Queue theReplayQueue; + vector theNamespaceV2Batch; + theNamespaceV2Batch.reserve(mMaxBlockSize); + uint64_t theNextBatchTxnId = 0; MetaRequest* thePtr; int64_t const theStartTime = microseconds(); bool theFirstItemFlag = true; @@ -1374,25 +1609,28 @@ class LogWriter::Impl : 0 == thePtr->status && MetaLogWriterControl::kWriteBlock == static_cast(thePtr)->type) { + FlushNamespaceV2Batch(theNamespaceV2Batch, theStartTime, + theFirstItemFlag); + theNextBatchTxnId = 0; // Run after setting replay state. theReplayQueue.PushBack(*thePtr); } else if (IsMetaLogWriteOrVrError(thePtr->status) || thePtr->replayBypassFlag || - ! mReplayerPtr->submit(*thePtr)) { - const int64_t theUsecsNow = theFirstItemFlag ? - theStartTime : microseconds(); - theFirstItemFlag = false; - if (META_LOG_WRITER_CONTROL != thePtr->op) { - if (0 == thePtr->status) { - mLogTimeUsec += theStartTime - theReq.submitTime; - mLogTimeOpsCount++; - } else { - mLogErrorOpsCount++; - } + ! 
mReplayerPtr->isSubmitQueueEnabled()) { + SubmitDoneOrBatch(theReq, theNamespaceV2Batch, + theNextBatchTxnId, theStartTime, theFirstItemFlag); + } else { + FlushNamespaceV2Batch(theNamespaceV2Batch, theStartTime, + theFirstItemFlag); + theNextBatchTxnId = 0; + if (! mReplayerPtr->submit(*thePtr)) { + SubmitDoneOrBatch(theReq, theNamespaceV2Batch, + theNextBatchTxnId, theStartTime, theFirstItemFlag); } - theReq.Submit(theUsecsNow); } } + FlushNamespaceV2Batch(theNamespaceV2Batch, theStartTime, + theFirstItemFlag); UpdateLogAvg(theStartTime); if (theSetReplayStateFlag) { thePtr = theReplayCommitHeadPtr; @@ -1730,14 +1968,73 @@ class LogWriter::Impl : theFailureInjectedFlag = true; break; } - ++mLastLogSeq.mLogSeq; - thePtr->logseq = mLastLogSeq; - if (! thePtr->WriteLog(theStream, mOmitDefaultsFlag)) { - panic("log writer: invalid request"); - } - if (! theStream) { - --mLastLogSeq.mLogSeq; - LogError(*thePtr); + if (IsNamespaceV2WalBatchable(*thePtr)) { + vector batch; + batch.reserve(kNamespaceV2WalBatchMaxCount); + for (MetaRequest* ptr = thePtr; + ptr && batch.size() < kNamespaceV2WalBatchMaxCount && + IsNamespaceV2WalBatchable(*ptr) && + (size_t)(mLastLogSeq.mLogSeq - mNextLogSeq.mLogSeq + + batch.size() + 1) < (size_t)mMaxBlockSize; + ptr = ptr->next) { + batch.push_back(ptr); + } + if (batch.size() <= 1) { + // Let the non-batch path handle it. + batch.clear(); + } + if (! batch.empty() && ! theStream) { + batch.clear(); + } + if (! batch.empty()) { + // Reserve create ids for the whole contiguous batch. + ReserveNamespaceV2CreateIdsBatch(*thePtr); + // Assign a unique log sequence to each op as usual. + for (vector::iterator it = batch.begin(); + it != batch.end(); + ++it) { + ++mLastLogSeq.mLogSeq; + (*it)->logseq = mLastLogSeq; + if (! (*it)->PrepareLog()) { + panic("log writer: invalid request"); + } + } + if (! 
WriteNamespaceV2WalBatchRecord(theStream, batch)) { + panic("log writer: invalid namespace v2 WAL batch record"); + } + for (size_t i = 1; i < batch.size(); i++) { + if (! WriteNamespaceV2WalBatchContRecord(theStream)) { + panic("log writer: invalid namespace v2 WAL batch cont record"); + } + } + // Skip the rest of the batch: the for-loop will ++thePtr, + // so stop at the last element. + thePtr = batch.back(); + } else { + ++mLastLogSeq.mLogSeq; + thePtr->logseq = mLastLogSeq; + ReserveNamespaceV2CreateIdsBatch(*thePtr); + if (! thePtr->PrepareLog() || + ! thePtr->WriteLog(theStream, mOmitDefaultsFlag)) { + panic("log writer: invalid request"); + } + if (! theStream) { + --mLastLogSeq.mLogSeq; + LogError(*thePtr); + } + } + } else { + ++mLastLogSeq.mLogSeq; + thePtr->logseq = mLastLogSeq; + ReserveNamespaceV2CreateIdsBatch(*thePtr); + if (! thePtr->PrepareLog() || + ! thePtr->WriteLog(theStream, mOmitDefaultsFlag)) { + panic("log writer: invalid request"); + } + if (! theStream) { + --mLastLogSeq.mLogSeq; + LogError(*thePtr); + } } } if (theEndBlockSeq <= mLastLogSeq.mLogSeq || diff --git a/src/cc/meta/MetaRequest.cc b/src/cc/meta/MetaRequest.cc index a5e11851b..fbcd4fc0b 100644 --- a/src/cc/meta/MetaRequest.cc +++ b/src/cc/meta/MetaRequest.cc @@ -37,6 +37,7 @@ #include "ClientSM.h" #include "Replay.h" #include "MetaVrOps.h" +#include "NamespaceV2.h" #include "kfsio/Globals.h" #include "kfsio/checksum.h" @@ -86,6 +87,191 @@ static bool gWormMode = false; static string gChunkmapDumpDir("."); static const char* const ftypes[] = { "empty", "file", "dir" }; +static bool +IsNamespaceV2RpcEnabled() +{ + const NamespaceV2::Config& cfg = NamespaceV2::GetConfig(); + return cfg.enabledFlag && cfg.rpcEnabledFlag; +} + +static bool +UseNamespaceV2RpcPath( + const MetaRequest& req) +{ + return req.namespaceV2LogFlag || + (IsNamespaceV2RpcEnabled() && ! 
req.replayFlag); +} + +static NamespaceV2::NamespaceStore& +GetNamespaceV2StoreLocked() +{ + return NamespaceV2::GetStore(); +} + +static FileType +NamespaceV2FileType( + NamespaceV2::InodeType type) +{ + return type == NamespaceV2::kInodeTypeDir ? KFS_DIR : KFS_FILE; +} + +class NamespaceV2RpcFattr : public MFattr +{ +public: + void Set( + const NamespaceV2::LookupResult& attr) + { + fid = attr.fid; + type = NamespaceV2FileType(attr.type); + striperType = KFS_STRIPED_FILE_TYPE_NONE; + numReplicas = type == KFS_FILE && 0 < attr.numReplicas ? + attr.numReplicas : 0; + numRecoveryStripes = 0; + numStripes = 0; + stripeSize = 0; + mtime = attr.mtime; + ctime = attr.ctime; + atime = attr.atime; + subcount1 = type == KFS_DIR ? attr.fileCount : 0; + subcount2 = type == KFS_DIR ? attr.dirCount : 0; + filesize = 0; + minSTier = kKfsSTierMax; + maxSTier = kKfsSTierMax; + fattrExtTypes = kFileAttrExtTypeNone; + user = attr.user; + group = attr.group; + mode = attr.mode; + extAttributes.clear(); + } +}; + +static void +NamespaceV2SetFattr( + const NamespaceV2::LookupResult& attr, + MFattr& fattr) +{ + NamespaceV2RpcFattr tmp; + tmp.Set(attr); + fattr = tmp; +} + +static int +NamespaceV2ResolveAbsPathLocked( + NamespaceV2::NamespaceStore& store, + fid_t& dir, + string& name) +{ + if (dir != ROOTFID || name.empty() || name[0] != '/' || + name[name.size() - 1] == '/') { + return 0; + } + const size_t nameStart = name.rfind('/'); + size_t parentEnd = nameStart; + while (parentEnd > 0 && name[parentEnd - 1] == '/') { + --parentEnd; + } + const string leaf = name.substr(nameStart + 1); + if (leaf.empty()) { + return -EINVAL; + } + if (parentEnd == 0) { + name = leaf; + return 0; + } + NamespaceV2::LookupResult parent; + const int status = store.LookupPath(ROOTFID, + name.substr(0, parentEnd), parent); + if (status != 0) { + return status; + } + if (parent.type != NamespaceV2::kInodeTypeDir) { + return -ENOTDIR; + } + dir = parent.fid; + name = leaf; + return 0; +} + +static bool 
+NamespaceV2HasCreateIds( + fid_t fid, + uint64_t txnId) +{ + return fid >= 0 && txnId != 0; +} + +static bool +NamespaceV2NeedsCreateIds( + fid_t fid, + uint64_t txnId) +{ + return fid < 0 && txnId == 0; +} + +static bool +NamespaceV2ReserveCreateIds( + fid_t& fid, + uint64_t& txnId) +{ + if (fid >= 0 && txnId != 0) { + return true; + } + if (fid >= 0 || txnId != 0) { + return false; + } + NamespaceV2::TxnId reservedTxnId = 0; + GetNamespaceV2StoreLocked().ReserveCreateIds(fid, reservedTxnId); + txnId = reservedTxnId; + return fid >= 0 && txnId != 0; +} + +static int +NamespaceV2ApplyCreateEdit( + fid_t parentFid, + const string& name, + NamespaceV2::InodeType type, + fid_t& fid, + uint64_t& txnId, + kfsUid_t user, + kfsGid_t group, + kfsMode_t mode, + int16_t numReplicas, + int64_t mtime, + bool commitFlag, + bool advanceSeedsFlag = true) +{ + if (! NamespaceV2HasCreateIds(fid, txnId)) { + return -EINVAL; + } + return GetNamespaceV2StoreLocked().ApplyCreateTrusted(parentFid, name, type, fid, + (NamespaceV2::TxnId)txnId, user, group, mode, numReplicas, mtime, + commitFlag, advanceSeedsFlag); +} + +void +MetaRequest::ReserveNamespaceV2CreateIdsBatch( + size_t count, + fid_t& firstFid, + uint64_t& firstTxnId) +{ + NamespaceV2::TxnId firstTxn = 0; + GetNamespaceV2StoreLocked().ReserveCreateIdsRange( + count, firstFid, firstTxn); + firstTxnId = firstTxn; +} + +void +MetaRequest::CommitNamespaceV2Batch( + uint64_t firstTxnId, + uint64_t lastTxnId) +{ + if (firstTxnId != 0 && lastTxnId != 0) { + GetNamespaceV2StoreLocked().CommitThroughRange( + (NamespaceV2::TxnId)firstTxnId, + (NamespaceV2::TxnId)lastTxnId); + } +} + class StIdempotentRequestHandler { public: @@ -701,6 +887,16 @@ MetaLookup::handle() } authType = kAuthenticationTypeUndef; // always reset if op gets here. 
SetEUserAndEGroup(*this); + if (UseNamespaceV2RpcPath(*this)) { + NamespaceV2::LookupResult attr; + NamespaceV2::NamespaceStore& store = GetNamespaceV2StoreLocked(); + status = (dir == ROOTFID && name == "/") ? + store.GetAttr(ROOTFID, attr) : store.Lookup(dir, name, attr); + if (status == 0) { + NamespaceV2SetFattr(attr, fattr); + } + return; + } MetaFattr* fa = 0; if ((status = metatree.lookup(dir, name, euser, egroup, fa)) == 0) { FattrReply(fa, fattr); @@ -720,6 +916,14 @@ MetaLookupPath::handle() return; } SetEUserAndEGroup(*this); + if (UseNamespaceV2RpcPath(*this)) { + NamespaceV2::LookupResult attr; + status = GetNamespaceV2StoreLocked().LookupPath(root, path, attr); + if (status == 0) { + NamespaceV2SetFattr(attr, fattr); + } + return; + } MetaFattr* fa = 0; if ((status = metatree.lookupPath( root, path, euser, egroup, fa)) == 0) { @@ -875,6 +1079,57 @@ MetaIdempotentRequest::IsHandled() const string kInvalidChunksPath("/proc/invalid_chunks"); const string kInvalidChunksPrefix(kInvalidChunksPath + "/"); +/* virtual */ bool +MetaCreate::PrepareLog() +{ + return ! namespaceV2LogFlag || + NamespaceV2ReserveCreateIds(fid, namespaceV2TxnId); +} + +/* virtual */ bool +MetaCreate::NeedsNamespaceV2CreateIds() const +{ + return namespaceV2LogFlag && ! replayFlag && ! replayBypassFlag && + status == 0 && NamespaceV2NeedsCreateIds(fid, namespaceV2TxnId); +} + +/* virtual */ bool +MetaCreate::SetNamespaceV2CreateIds( + fid_t inFid, + uint64_t inTxnId) +{ + if (! NeedsNamespaceV2CreateIds() || inFid < 0 || inTxnId == 0) { + return false; + } + fid = inFid; + namespaceV2TxnId = inTxnId; + return true; +} + +/* virtual */ bool +MetaCreate::CanBatchApplyNamespaceV2() const +{ + return namespaceV2LogFlag && ! namespaceV2AppliedFlag && + ! replayFlag && ! 
replayBypassFlag && status == 0 && + logseq.IsValid() && NamespaceV2HasCreateIds(fid, namespaceV2TxnId); +} + +/* virtual */ uint64_t +MetaCreate::GetNamespaceV2BatchTxnId() const +{ + return namespaceV2TxnId; +} + +/* virtual */ void +MetaCreate::ApplyNamespaceV2Batch( + bool commitFlag) +{ + status = NamespaceV2ApplyCreateEdit(dir, name, + NamespaceV2::kInodeTypeFile, fid, namespaceV2TxnId, + user, group, mode, numReplicas, mtime, commitFlag, false); + namespaceV2AppliedFlag = true; +} + /* virtual */ bool MetaCreate::start() { @@ -888,6 +1143,58 @@ MetaCreate::start() if (0 != status) { return false; } + if (IsNamespaceV2RpcEnabled()) { + const bool kDirFlag = false; + if (! CheckCreatePerms(*this, kDirFlag)) { + return false; + } + if (gWormMode && ! IsWormMutationAllowed(name)) { + statusMsg = "worm mode"; + status = -EPERM; + return false; + } + fid = -1; + const bool wasNotObjectStoreFileFlag = 0 < numReplicas; + if (striperType != KFS_STRIPED_FILE_TYPE_NONE && + 0 < numRecoveryStripes) { + numReplicas = min(numReplicas, + gLayoutManager.GetMaxReplicasPerRSFile()); + } else { + numReplicas = min(numReplicas, + gLayoutManager.GetMaxReplicasPerFile()); + } + if (0 == numReplicas && wasNotObjectStoreFileFlag && + gLayoutManager.IsObjectStoreEnabled()) { + striperType = KFS_STRIPED_FILE_TYPE_NONE; + numRecoveryStripes = 0; + numStripes = 0; + stripeSize = 0; + if (minSTier < kKfsSTierMax) { + maxSTier = minSTier; + } + } + if (maxSTier < minSTier || ! IsValidSTier(minSTier) || + ! IsValidSTier(maxSTier)) { + status = -EINVAL; + statusMsg = "invalid storage tier range"; + return false; + } + if (minSTier < kKfsSTierMax && 0 == numReplicas && + minSTier != maxSTier) { + status = -EINVAL; + statusMsg = "storage tier range is not supported with object store files"; + return false; + } + if (! 
gLayoutManager.Validate(*this)) { + if (0 <= status) { + status = -EINVAL; + } + return false; + } + mtime = microseconds(); + namespaceV2LogFlag = true; + return true; + } const bool invalChunkFlag = dir == ROOTFID && startsWith(name, kInvalidChunksPrefix); if (invalChunkFlag) { @@ -1019,6 +1326,15 @@ MetaCreate::handle() if (IsHandled()) { return; } + if (namespaceV2AppliedFlag) { + return; + } + if (UseNamespaceV2RpcPath(*this)) { + status = NamespaceV2ApplyCreateEdit(dir, name, + NamespaceV2::kInodeTypeFile, fid, namespaceV2TxnId, + user, group, mode, numReplicas, mtime, true); + return; + } fid = 0; MetaFattr* fa = 0; bool const kToDumpsterFlag = true; @@ -1057,6 +1373,57 @@ MetaCreate::handle() } } +/* virtual */ bool +MetaMkdir::PrepareLog() +{ + return ! namespaceV2LogFlag || + NamespaceV2ReserveCreateIds(fid, namespaceV2TxnId); +} + +/* virtual */ bool +MetaMkdir::NeedsNamespaceV2CreateIds() const +{ + return namespaceV2LogFlag && ! replayFlag && ! replayBypassFlag && + status == 0 && NamespaceV2NeedsCreateIds(fid, namespaceV2TxnId); +} + +/* virtual */ bool +MetaMkdir::SetNamespaceV2CreateIds( + fid_t inFid, + uint64_t inTxnId) +{ + if (! NeedsNamespaceV2CreateIds() || inFid < 0 || inTxnId == 0) { + return false; + } + fid = inFid; + namespaceV2TxnId = inTxnId; + return true; +} + +/* virtual */ bool +MetaMkdir::CanBatchApplyNamespaceV2() const +{ + return namespaceV2LogFlag && ! namespaceV2AppliedFlag && + ! replayFlag && ! 
replayBypassFlag && status == 0 && + logseq.IsValid() && NamespaceV2HasCreateIds(fid, namespaceV2TxnId); +} + +/* virtual */ uint64_t +MetaMkdir::GetNamespaceV2BatchTxnId() const +{ + return namespaceV2TxnId; +} + +/* virtual */ void +MetaMkdir::ApplyNamespaceV2Batch( + bool commitFlag) +{ + status = NamespaceV2ApplyCreateEdit(dir, name, + NamespaceV2::kInodeTypeDir, fid, namespaceV2TxnId, + user, group, mode, 0, mtime, commitFlag, false); + namespaceV2AppliedFlag = true; +} + /* virtual */ bool MetaMkdir::start() { @@ -1074,6 +1441,9 @@ MetaMkdir::start() if (! CheckCreatePerms(*this, kDirFlag)) { return false; } + if (IsNamespaceV2RpcEnabled()) { + namespaceV2LogFlag = true; + } if (0 == status) { mtime = microseconds(); } @@ -1086,6 +1456,15 @@ MetaMkdir::handle() if (IsHandled()) { return; } + if (namespaceV2AppliedFlag) { + return; + } + if (UseNamespaceV2RpcPath(*this)) { + status = NamespaceV2ApplyCreateEdit(dir, name, + NamespaceV2::kInodeTypeDir, fid, namespaceV2TxnId, + user, group, mode, 0, mtime, true); + return; + } fid = 0; MetaFattr* fa = 0; status = metatree.mkdir( @@ -1150,6 +1529,9 @@ MetaRemove::start() if (0 == status) { mtime = microseconds(); } + if (IsNamespaceV2RpcEnabled()) { + namespaceV2LogFlag = true; + } return (0 == status); } @@ -1159,6 +1541,18 @@ MetaRemove::handle() if (IsHandled()) { return; } + if (UseNamespaceV2RpcPath(*this)) { + NamespaceV2::NamespaceStore& store = GetNamespaceV2StoreLocked(); + status = NamespaceV2ResolveAbsPathLocked(store, dir, name); + if (status == 0) { + NamespaceV2::TxnId txnId = 0; + status = store.RemoveFile(dir, name, &txnId); + if (status == 0) { + store.CommitThrough(txnId); + } + } + return; + } if ((status = LookupAbsPath(dir, name, euser, egroup)) != 0) { return; } @@ -1184,6 +1578,9 @@ MetaRmdir::start() if (0 == status) { mtime = microseconds(); } + if (IsNamespaceV2RpcEnabled()) { + namespaceV2LogFlag = true; + } return (0 == status); } @@ -1193,6 +1590,18 @@ MetaRmdir::handle() if 
(IsHandled()) { return; } + if (UseNamespaceV2RpcPath(*this)) { + NamespaceV2::NamespaceStore& store = GetNamespaceV2StoreLocked(); + status = NamespaceV2ResolveAbsPathLocked(store, dir, name); + if (status == 0) { + NamespaceV2::TxnId txnId = 0; + status = store.Rmdir(dir, name, &txnId); + if (status == 0) { + store.CommitThrough(txnId); + } + } + return; + } if ((status = LookupAbsPath(dir, name, euser, egroup)) != 0) { return; } @@ -1243,6 +1652,71 @@ MetaReaddir::handle() } numEntries = 0; resp.Clear(); + if (UseNamespaceV2RpcPath(*this)) { + NamespaceV2::NamespaceStore& store = GetNamespaceV2StoreLocked(); + NamespaceV2::LookupResult dirAttr; + status = store.GetAttr(dir, dirAttr); + if (status != 0) { + return; + } + if (dirAttr.type != NamespaceV2::kInodeTypeDir) { + status = -ENOTDIR; + return; + } + SetEUserAndEGroup(*this); + NamespaceV2RpcFattr dirFattr; + dirFattr.Set(dirAttr); + if (! dirFattr.CanRead(euser, egroup)) { + status = -EACCES; + return; + } + NamespaceV2::ReaddirResult result; + const size_t v2MaxEntries = 0 < maxEntries ? (size_t)maxEntries : + numeric_limits::max(); + status = fnameStart.empty() ? store.Readdir(dir, 0, + v2MaxEntries, result) : store.ReaddirFromName(dir, fnameStart, + v2MaxEntries, result); + if (status != 0) { + return; + } + hasMoreEntriesFlag = result.moreEntriesFlag; + if (oldFormatFlag && hasMoreEntriesFlag) { + status = -ENOMEM; + statusMsg = "response exceeds max. allowed number of entries" + " consider updating kfs client lib"; + return; + } + const int extSize = IOBufferData::GetDefaultBufferSize() + + int(MAX_FILE_NAME_LENGTH); + int maxSize = gLayoutManager.GetMaxResponseSize(); + if (! 
oldFormatFlag && extSize * 2 < maxSize) { + maxSize -= extSize; + } + IOBufferWriter writer(resp); + size_t i = 0; + for (; i < result.entries.size() && writer.GetSize() <= maxSize; + ++i) { + const string& entryName = result.entries[i].key.name; + if (dir == ROOTFID && entryName == "/") { + continue; + } + writer.Write(entryName); + writer.Write("\n", 1); + ++numEntries; + } + writer.Close(); + if (resp.BytesConsumable() > maxSize) { + if (oldFormatFlag) { + resp.Clear(); + numEntries = 0; + status = -ENOMEM; + statusMsg = "response exceeds max. size"; + } else if (i < result.entries.size()) { + hasMoreEntriesFlag = true; + } + } + return; + } vector& v = GetReadDirTmpVec(); if ((status = fnameStart.empty() ? metatree.readdir(dir, v, @@ -1806,6 +2280,91 @@ MetaReaddirPlus::handle() (maxEntries <= 0 || numEntries < maxEntries)) { maxEntries = numEntries; } + if (UseNamespaceV2RpcPath(*this)) { + NamespaceV2::NamespaceStore& store = GetNamespaceV2StoreLocked(); + NamespaceV2::LookupResult dirAttr; + status = store.GetAttr(dir, dirAttr); + if (status != 0) { + return; + } + if (dirAttr.type != NamespaceV2::kInodeTypeDir) { + status = -ENOTDIR; + return; + } + SetEUserAndEGroup(*this); + NamespaceV2RpcFattr dirFattr; + dirFattr.Set(dirAttr); + if (! dirFattr.CanRead(euser, egroup)) { + status = -EACCES; + return; + } + noAttrsFlag = ! dirFattr.CanSearch(euser, egroup); + NamespaceV2::ReaddirResult result; + const size_t v2MaxEntries = 0 < maxEntries ? (size_t)maxEntries : + numeric_limits::max(); + status = fnameStart.empty() ? store.Readdir(dir, 0, + v2MaxEntries, result) : store.ReaddirFromName(dir, fnameStart, + v2MaxEntries, result); + if (status != 0) { + return; + } + hasMoreEntriesFlag = result.moreEntriesFlag; + if (numEntries < 0 && hasMoreEntriesFlag) { + status = -ENOMEM; + statusMsg = "response exceeds max. 
allowed number of entries" + " consider updating kfs client lib"; + return; + } + maxRespSize = max(0, gLayoutManager.GetMaxResponseSize()); + const int extSize = IOBufferData::GetDefaultBufferSize() + + int(MAX_FILE_NAME_LENGTH); + const size_t maxSize = (size_t)((numEntries >= 0 && + extSize * 2 < maxRespSize) ? maxRespSize - extSize : + maxRespSize); + dentries.reserve(result.entries.size() + (fnameStart.empty() ? 2 : 0)); + omitLastChunkInfoFlag = true; + size_t responseSize = 0; + if (fnameStart.empty()) { + dentries.push_back(DEntry(dirFattr, ".")); + dentries.push_back(DEntry(dirFattr, "..")); + responseSize += 2 * 148 + 3; + } + size_t i = 0; + for (; i < result.entries.size() && responseSize <= maxSize; ++i) { + const NamespaceV2::ReaddirResult::Entry& entry = + result.entries[i]; + NamespaceV2::LookupResult attr; + if (store.GetAttr(entry.childFid, attr) != 0) { + continue; + } + NamespaceV2RpcFattr fa; + fa.Set(attr); + const string& entryName = entry.key.name; + if (fa.id() == ROOTFID && entryName == "/") { + continue; + } + responseSize += entryName.length() + + (fa.type == KFS_DIR ? 148 : 272); + dentries.push_back(DEntry(fa, entryName)); + } + if (maxSize < responseSize) { + if (numEntries < 0) { + status = -ENOMEM; + statusMsg = "response exceeds max. size"; + dentries.clear(); + responseSize = 0; + } else if (i < result.entries.size()) { + hasMoreEntriesFlag = true; + } + } + ioBufPending = (int64_t)responseSize; + if (ioBufPending > 0) { + gLayoutManager.ChangeIoBufPending(ioBufPending); + maxRespSize = (int)max((int64_t)maxRespSize, ioBufPending + + IOBufferData::GetDefaultBufferSize()); + } + return; + } vector& res = GetReadDirTmpVec(); if ((status = fnameStart.empty() ? 
metatree.readdir(dir, res, @@ -2011,6 +2570,13 @@ MetaGetalloc::handle() return; } if (err) { + if (gLayoutManager.ScheduleTruncateToLastRecoverableChunk( + fid, chunkId, chunkOff_t(-1))) { + status = -EAGAIN; + statusMsg = "truncating unrecoverable tail chunk: "; + AppendDecIntToString(statusMsg, chunkId); + return; + } status = -EAGAIN; statusMsg = "no replicas available chunk: "; AppendDecIntToString(statusMsg, chunkId); @@ -2111,6 +2677,13 @@ MetaGetlayout::handle() assert(! fa || cfa == fa); if (err && ! continueIfNoReplicasFlag) { resp.Clear(); + if (gLayoutManager.ScheduleTruncateToLastRecoverableChunk( + fid, l.chunkId, chunkOff_t(-1))) { + status = -EAGAIN; + statusMsg = "truncating unrecoverable tail chunk: "; + AppendDecIntToString(statusMsg, l.chunkId); + break; + } status = -EHOSTUNREACH; statusMsg = "no replicas available chunk: "; AppendDecIntToString(statusMsg, l.chunkId); @@ -2182,6 +2755,9 @@ MetaAllocate::dispatch(ClientSM& sm) MetaAllocate::handle() { assert(! MetaRequest::next); + if (debugStartUsec <= 0) { + debugStartUsec = microseconds(); + } suspended = false; if (startedFlag) { return; @@ -2330,8 +2906,10 @@ MetaAllocate::handle() } return; } + debugBeforeLayoutUsec = microseconds(); suspended = true; const int ret = gLayoutManager.AllocateChunk(*this, chunkBlock); + debugAfterLayoutUsec = microseconds(); if (0 == ret) { return; } @@ -2343,6 +2921,7 @@ MetaAllocate::handle() void MetaAllocate::LayoutDone(int64_t chunkAllocProcessTime) { + debugLayoutDoneUsec = microseconds(); suspended = false; if (0 == status) { // Check if all servers are still up, and didn't go down @@ -2385,8 +2964,12 @@ MetaAllocate::LayoutDone(int64_t chunkAllocProcessTime) } if (0 == status) { assert(! MetaRequest::next); + debugLogStartUsec = microseconds(); suspended = true; submit_request(new MetaLogChunkAllocate(this)); + // This log request is submitted while processing another request, so it + // can bypass the client thread's end-of-batch log flush trigger. 
+ GetLogWriter().ScheduleFlush(); return; } const bool kCountAllocTimeFlag = true; @@ -2609,6 +3192,30 @@ MetaAllocate::Done(bool countAllocTimeFlag, int64_t chunkAllocProcessTime) processTime += microseconds() - chunkAllocProcessTime; } if (! next) { + const int64_t now = microseconds(); + const int64_t totalUsec = debugStartUsec > 0 ? now - debugStartUsec : 0; + if (100000 <= totalUsec) { + KFS_LOG_STREAM_INFO << + "allocate timing:" + " seq: " << opSeqno << + " fid: " << fid << + " chunk: " << chunkId << + " status: " << status << + " total-usec: " << totalUsec << + " pre-layout-usec: " << + (debugBeforeLayoutUsec > debugStartUsec ? + debugBeforeLayoutUsec - debugStartUsec : 0) << + " layout-call-usec: " << + (debugAfterLayoutUsec > debugBeforeLayoutUsec ? + debugAfterLayoutUsec - debugBeforeLayoutUsec : 0) << + " wait-chunk-usec: " << + (debugLayoutDoneUsec > debugAfterLayoutUsec ? + debugLayoutDoneUsec - debugAfterLayoutUsec : 0) << + " log-wait-usec: " << + (debugLogStartUsec > 0 ? now - debugLogStartUsec : 0) << + " servers: " << servers.size() << + KFS_LOG_EOM; + } submit_request(this); return; } @@ -2938,6 +3545,9 @@ MetaRename::start() } if (0 == status) { mtime = microseconds(); + if (IsNamespaceV2RpcEnabled()) { + namespaceV2LogFlag = true; + } } return (0 == status); } @@ -2952,10 +3562,19 @@ MetaRename::handle() // renames are disabled in WORM mode: otherwise, we // ocould overwrite an existing file srcFid = -1; - bool const kToDumpsterFlag = true; - status = metatree.rename(dir, oldname, newname, - oldpath, overwrite && ! wormModeFlag, euser, egroup, - mtime, &srcFid, kToDumpsterFlag); + if (UseNamespaceV2RpcPath(*this)) { + NamespaceV2::TxnId txnId = 0; + status = GetNamespaceV2StoreLocked().Rename(dir, oldname, + newname, overwrite && ! 
wormModeFlag, &txnId, &srcFid); + if (status == 0 && txnId != 0) { + GetNamespaceV2StoreLocked().CommitThrough(txnId); + } + } else { + bool const kToDumpsterFlag = true; + status = metatree.rename(dir, oldname, newname, + oldpath, overwrite && ! wormModeFlag, euser, egroup, + mtime, &srcFid, kToDumpsterFlag); + } if (wormModeFlag && -EEXIST == status) { statusMsg = "worm mode"; status = -EPERM; @@ -5050,6 +5669,9 @@ MetaAllocate::responseSelf(ReqOstream& os) (shortRpcFormatFlag ? "H:" : "Chunk-handle: ") << chunkId << "\r\n" << (shortRpcFormatFlag ? "V:" : "Chunk-version: ") << (0 == numReplicas ? -chunkVersion - 1 : chunkVersion) << "\r\n"; + if (0 <= leaseId) { + os << (shortRpcFormatFlag ? "L:" : "Lease-id: ") << leaseId << "\r\n"; + } if (appendChunk) { os << (shortRpcFormatFlag ? "O:" : "Chunk-offset: ") << offset << "\r\n"; diff --git a/src/cc/meta/MetaRequest.h b/src/cc/meta/MetaRequest.h index f61cc5bcf..8f12ac3c1 100644 --- a/src/cc/meta/MetaRequest.h +++ b/src/cc/meta/MetaRequest.h @@ -283,6 +283,7 @@ struct MetaRequest { bool replayFlag; bool commitPendingFlag; bool replayBypassFlag; + bool namespaceV2LogFlag; string clientIp; string clientReportedIp; string nodeId; @@ -317,6 +318,7 @@ struct MetaRequest { replayFlag(false), commitPendingFlag(false), replayBypassFlag(false), + namespaceV2LogFlag(false), clientIp(), clientReportedIp(), nodeId(), @@ -344,6 +346,16 @@ struct MetaRequest { //!< response to be sent back as per the KFS protocol. 
virtual void response(ReqOstream& os, IOBuffer& /* buf */) { response(os); } virtual bool log(ostream& file) const; + virtual bool PrepareLog() { return true; } + virtual bool NeedsNamespaceV2CreateIds() const { return false; } + virtual bool SetNamespaceV2CreateIds( + fid_t /* fid */, uint64_t /* txnId */) { return false; } + static void ReserveNamespaceV2CreateIdsBatch( + size_t count, fid_t& firstFid, uint64_t& firstTxnId); + virtual bool CanBatchApplyNamespaceV2() const { return false; } + virtual uint64_t GetNamespaceV2BatchTxnId() const { return 0; } + virtual void ApplyNamespaceV2Batch(bool /* commitFlag */) {} + static void CommitNamespaceV2Batch(uint64_t firstTxnId, uint64_t lastTxnId); Display Show() const { return Display(*this); } virtual void setChunkServer(const ChunkServerPtr& /* cs */) {}; bool ValidateRequestHeader( @@ -395,6 +407,7 @@ struct MetaRequest { .Def("u", &MetaRequest::euser, kKfsUserNone) .Def("g", &MetaRequest::egroup, kKfsGroupNone) .Def("a", &MetaRequest::authUid, kKfsUserNone) + .Def("V2", &MetaRequest::namespaceV2LogFlag, false) .Def("z", &MetaRequest::logseq) .Def("x", &MetaRequest::shortRpcFormatFlag, true) ; @@ -468,6 +481,7 @@ struct MetaRequest { replayFlag = false; commitPendingFlag = false; replayBypassFlag = false; + namespaceV2LogFlag = false; clientIp = string(); nodeId = string(); reqHeaders.Clear(); @@ -695,6 +709,8 @@ struct MetaCreate: public MetaIdempotentRequest { string name; //!< name to create string ownerName; string groupName; + uint64_t namespaceV2TxnId; + bool namespaceV2AppliedFlag; int64_t mtime; MetaCreate() : MetaIdempotentRequest(META_CREATE, kLogIfOk), @@ -714,10 +730,18 @@ struct MetaCreate: public MetaIdempotentRequest { name(), ownerName(), groupName(), + namespaceV2TxnId(0), + namespaceV2AppliedFlag(false), mtime() {} virtual bool start(); virtual void handle(); + virtual bool PrepareLog(); + virtual bool NeedsNamespaceV2CreateIds() const; + virtual bool SetNamespaceV2CreateIds(fid_t fid, uint64_t 
txnId); + virtual bool CanBatchApplyNamespaceV2() const; + virtual uint64_t GetNamespaceV2BatchTxnId() const; + virtual void ApplyNamespaceV2Batch(bool commitFlag); virtual void response(ReqOstream &os); virtual ostream& ShowSelf(ostream& os) const { @@ -784,6 +808,8 @@ struct MetaCreate: public MetaIdempotentRequest { .Def("SS", &MetaCreate::stripeSize, int32_t(0)) .Def("E", &MetaCreate::exclusive, false) .Def("N", &MetaCreate::name) + .Def("H", &MetaCreate::fid, fid_t(-1)) + .Def("VT", &MetaCreate::namespaceV2TxnId, uint64_t(0)) .Def("O", &MetaCreate::user, kKfsUserNone) .Def("G", &MetaCreate::group, kKfsGroupNone) .Def("M", &MetaCreate::mode, kKfsModeUndef) @@ -809,6 +835,8 @@ struct MetaMkdir: public MetaIdempotentRequest { string ownerName; string groupName; int64_t mtime; + uint64_t namespaceV2TxnId; + bool namespaceV2AppliedFlag; MetaMkdir() : MetaIdempotentRequest(META_MKDIR, kLogIfOk), dir(-1), @@ -821,10 +849,18 @@ struct MetaMkdir: public MetaIdempotentRequest { name(), ownerName(), groupName(), - mtime() + mtime(), + namespaceV2TxnId(0), + namespaceV2AppliedFlag(false) {} virtual bool start(); virtual void handle(); + virtual bool PrepareLog(); + virtual bool NeedsNamespaceV2CreateIds() const; + virtual bool SetNamespaceV2CreateIds(fid_t fid, uint64_t txnId); + virtual bool CanBatchApplyNamespaceV2() const; + virtual uint64_t GetNamespaceV2BatchTxnId() const; + virtual void ApplyNamespaceV2Batch(bool commitFlag); virtual void response(ReqOstream &os); virtual ostream& ShowSelf(ostream& os) const { @@ -875,6 +911,8 @@ struct MetaMkdir: public MetaIdempotentRequest { return MetaIdempotentRequest::LogIoDef(parser) .Def("P", &MetaMkdir::dir, fid_t(-1)) .Def("N", &MetaMkdir::name ) + .Def("H", &MetaMkdir::fid, fid_t(-1)) + .Def("VT", &MetaMkdir::namespaceV2TxnId, uint64_t(0)) .Def("U", &MetaMkdir::user, kKfsUserNone) .Def("G", &MetaMkdir::group, kKfsGroupNone) .Def("M", &MetaMkdir::mode, kKfsModeUndef) @@ -1358,6 +1396,11 @@ struct MetaAllocate: public 
MetaRequest, public KfsCallbackObj { bool allChunkServersShortRpcFlag; bool logChunkVersionChangeFailedFlag; bool stoppedServicingFlag; + int64_t debugStartUsec; + int64_t debugBeforeLayoutUsec; + int64_t debugAfterLayoutUsec; + int64_t debugLayoutDoneUsec; + int64_t debugLogStartUsec; TokenSeq tokenSeq; time_t issuedTime; int validForTime; @@ -1404,6 +1447,11 @@ struct MetaAllocate: public MetaRequest, public KfsCallbackObj { allChunkServersShortRpcFlag(false), logChunkVersionChangeFailedFlag(false), stoppedServicingFlag(false), + debugStartUsec(0), + debugBeforeLayoutUsec(0), + debugAfterLayoutUsec(0), + debugLayoutDoneUsec(0), + debugLogStartUsec(0), tokenSeq(), issuedTime(), validForTime(0), diff --git a/src/cc/meta/NamespaceV2.cc b/src/cc/meta/NamespaceV2.cc new file mode 100644 index 000000000..eac86c228 --- /dev/null +++ b/src/cc/meta/NamespaceV2.cc @@ -0,0 +1,2969 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Memory-native namespace scaffolding for RFC-0001. +// +// Copyright 2026 Quantcast Corporation. All rights reserved. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0. 
+// +//---------------------------------------------------------------------------- + +#include "NamespaceV2.h" + +#include "common/Properties.h" +#include "common/hsieh_hash.h" +#include "common/time.h" +#include "qcdio/QCMutex.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace KFS +{ +namespace NamespaceV2 +{ + +namespace +{ + const TxnId kNoTxn = 0; + Config sConfig; + enum { kNamespaceV2ShardCount = 1024 }; + + size_t + GetLockShard( + fid_t fid) + { + return (size_t)((uint64_t)fid * 11400714819323198485ULL) % + kNamespaceV2ShardCount; + } + + QCMutex& + GetTxnMutex() + { + static QCMutex sMutex; + return sMutex; + } + + QCMutex* + GetDirShardMutexes() + { + static QCMutex sLocks[kNamespaceV2ShardCount]; + return sLocks; + } + + QCMutex* + GetInodeShardMutexes() + { + static QCMutex sLocks[kNamespaceV2ShardCount]; + return sLocks; + } + + QCMutex& + GetDirShardMutex( + fid_t fid) + { + return GetDirShardMutexes()[GetLockShard(fid)]; + } + + QCMutex& + GetInodeShardMutex( + fid_t fid) + { + return GetInodeShardMutexes()[GetLockShard(fid)]; + } + + void + AddMutex( + std::vector& locks, + QCMutex& mutex) + { + locks.push_back(&mutex); + } + + void + AddDirShardMutex( + std::vector& locks, + fid_t fid) + { + AddMutex(locks, GetDirShardMutex(fid)); + } + + void + AddInodeShardMutex( + std::vector& locks, + fid_t fid) + { + AddMutex(locks, GetInodeShardMutex(fid)); + } + + void + AddAllDirShardMutexes( + std::vector& locks) + { + QCMutex* const mutexes = GetDirShardMutexes(); + for (size_t i = 0; i < kNamespaceV2ShardCount; ++i) { + AddMutex(locks, mutexes[i]); + } + } + + void + AddAllInodeShardMutexes( + std::vector& locks) + { + QCMutex* const mutexes = GetInodeShardMutexes(); + for (size_t i = 0; i < kNamespaceV2ShardCount; ++i) { + AddMutex(locks, mutexes[i]); + } + } + + class ScopedMutex + { + public: + explicit ScopedMutex( + QCMutex& mutex) + : mMutex(mutex) + { + mMutex.Lock(); + } + ~ScopedMutex() + { + 
mMutex.Unlock(); + } + private: + QCMutex& mMutex; + + ScopedMutex(const ScopedMutex&); + ScopedMutex& operator=(const ScopedMutex&); + }; + + class ScopedMutexGroup + { + public: + explicit ScopedMutexGroup( + std::vector locks) + : mLocks(locks) + { + std::sort(mLocks.begin(), mLocks.end()); + mLocks.erase(std::unique(mLocks.begin(), mLocks.end()), + mLocks.end()); + for (std::vector::iterator it = mLocks.begin(); + it != mLocks.end(); + ++it) { + (*it)->Lock(); + } + } + ~ScopedMutexGroup() + { + for (std::vector::reverse_iterator it = + mLocks.rbegin(); + it != mLocks.rend(); + ++it) { + (*it)->Unlock(); + } + } + private: + std::vector mLocks; + + ScopedMutexGroup(const ScopedMutexGroup&); + ScopedMutexGroup& operator=(const ScopedMutexGroup&); + }; + + static void + SortUniqueMutexPtrs( + QCMutex** locks, + size_t& count) + { + if (count <= 1) { + return; + } + if (count == 2) { + if (locks[0] > locks[1]) { + QCMutex* const tmp = locks[0]; + locks[0] = locks[1]; + locks[1] = tmp; + } + return; + } + if (count == 3) { + if (locks[0] > locks[1]) { + QCMutex* const tmp = locks[0]; + locks[0] = locks[1]; + locks[1] = tmp; + } + if (locks[1] > locks[2]) { + QCMutex* const tmp = locks[1]; + locks[1] = locks[2]; + locks[2] = tmp; + } + if (locks[0] > locks[1]) { + QCMutex* const tmp = locks[0]; + locks[0] = locks[1]; + locks[1] = tmp; + } + return; + } + if (count == 4) { + for (size_t i = 1; i < count; ++i) { + QCMutex* const key = locks[i]; + size_t j = i; + while (j > 0 && locks[j - 1] > key) { + locks[j] = locks[j - 1]; + --j; + } + locks[j] = key; + } + return; + } + std::sort(locks, locks + count); + } + + class ScopedSmallMutexGroup + { + public: + ScopedSmallMutexGroup( + QCMutex* lock0, + QCMutex* lock1, + QCMutex* lock2 = 0, + QCMutex* lock3 = 0) + : mCount(0) + { + Add(lock0); + Add(lock1); + Add(lock2); + Add(lock3); + SortUniqueMutexPtrs(mLocks, mCount); + mCount = DedupeMutexPtrs(mLocks, mCount); + for (size_t i = 0; i < mCount; ++i) { + 
mLocks[i]->Lock(); + } + } + ~ScopedSmallMutexGroup() + { + while (mCount > 0) { + mLocks[--mCount]->Unlock(); + } + } + private: + static size_t + DedupeMutexPtrs( + QCMutex** locks, + size_t count) + { + if (count <= 1) { + return count; + } + size_t out = 1; + for (size_t i = 1; i < count; ++i) { + if (locks[i] != locks[out - 1]) { + locks[out++] = locks[i]; + } + } + return out; + } + void Add(QCMutex* mutex) + { + if (mutex) { + mLocks[mCount++] = mutex; + } + } + + QCMutex* mLocks[4]; + size_t mCount; + + ScopedSmallMutexGroup(const ScopedSmallMutexGroup&); + ScopedSmallMutexGroup& operator=(const ScopedSmallMutexGroup&); + }; + + + uint64_t + HashName( + const std::string& name) + { + Hsieh_hash_fcn hash; + return (uint64_t(hash(name)) << 4); + } + + bool + IsDeletedAt( + const VersionedDirEntry& entry, + TxnId committedTxn) + { + return entry.deleteTxn != kNoTxn && entry.deleteTxn <= committedTxn; + } + + bool + IsLegalName( + const std::string& name) + { + return ! name.empty() && name.size() <= MAX_FILE_NAME_LENGTH && + name.find_first_of("/\n") == std::string::npos; + } + + bool + IsSupportedInodeType( + InodeType type) + { + return type == kInodeTypeFile || type == kInodeTypeDir || + type == kInodeTypeSymlink; + } + + bool + IsVisibleOrPendingEntry( + const VersionedDirEntry* entry, + TxnId committedTxn) + { + if (! entry) { + return false; + } + if (entry->deleteTxn == kNoTxn) { + return true; + } + if (! entry->pendingFlag) { + return false; + } + return committedTxn == kNoTxn || committedTxn < entry->deleteTxn; + } + + int + ValidateCreateRequest( + const std::string& name, + InodeType type) + { + if (! IsLegalName(name) || ! 
// Returns the lower case hexadecimal digit for a nibble.
// value: expected to be in [0, 15]; callers mask with 0xf.
char
HexDigit(
    int value)
{
    return (char)(value < 10 ? '0' + value : 'a' + (value - 10));
}

// Returns the numeric value of a hexadecimal digit, accepting both lower and
// upper case letters, or -1 if the character is not a hexadecimal digit.
int
HexValue(
    char value)
{
    if ('0' <= value && value <= '9') {
        return value - '0';
    }
    if ('a' <= value && value <= 'f') {
        return value - 'a' + 10;
    }
    if ('A' <= value && value <= 'F') {
        return value - 'A' + 10;
    }
    return -1;
}

// Encodes every byte of value as two lower case hexadecimal digits, most
// significant nibble first. The result is exactly twice as long as the input.
std::string
EncodeString(
    const std::string& value)
{
    std::string result;
    result.reserve(value.size() * 2);
    for (std::string::const_iterator it = value.begin();
            it != value.end();
            ++it) {
        // Go through unsigned char so that bytes >= 0x80 do not sign extend.
        const int byte = (unsigned char)*it;
        result.push_back(HexDigit((byte >> 4) & 0xf));
        result.push_back(HexDigit(byte & 0xf));
    }
    return result;
}

// Names are stored with the same hexadecimal encoding as any other string.
std::string
EncodeName(
    const std::string& name)
{
    return EncodeString(name);
}

// Reverses EncodeString().
// encoded: hexadecimal text, either letter case.
// value:   receives the decoded bytes; left untouched on failure.
// Returns false if encoded has odd length or contains a character that is
// not a hexadecimal digit.
bool
DecodeHexString(
    const std::string& encoded,
    std::string&       value)
{
    if (encoded.size() % 2 != 0) {
        return false;
    }
    std::string result;
    result.reserve(encoded.size() / 2);
    for (size_t i = 0; i < encoded.size(); i += 2) {
        const int hi = HexValue(encoded[i]);
        const int lo = HexValue(encoded[i + 1]);
        if (hi < 0 || lo < 0) {
            return false;
        }
        result.push_back((char)((hi << 4) | lo));
    }
    // Publish only after the whole input has been validated.
    value.swap(result);
    return true;
}
DecodeHexString(encoded, result) || result.empty() || + result.find(char(10)) != std::string::npos) { + return false; + } + path.swap(result); + return true; + } + + int + InodeTypeToInt( + InodeType type) + { + return type == kInodeTypeDir ? 1 : + (type == kInodeTypeSymlink ? 2 : 0); + } + + bool + IntToInodeType( + int value, + InodeType& type) + { + if (value == 0) { + type = kInodeTypeFile; + return true; + } + if (value == 1) { + type = kInodeTypeDir; + return true; + } + if (value == 2) { + type = kInodeTypeSymlink; + return true; + } + return false; + } + + const char* + EditLogRecordTypeName( + EditLogRecord::Type type) + { + return type == EditLogRecord::kCreate ? "create" : + (type == EditLogRecord::kRemove ? "remove" : + (type == EditLogRecord::kRename ? "rename" : "invalid")); + } + + int + ValidateEditLogRecord( + const EditLogRecord& record) + { + if (record.txnId == kNoTxn || record.parentFid < 0) { + return -EINVAL; + } + if (record.type == EditLogRecord::kCreate) { + if (record.fid < 0 || ! IsLegalName(record.name) || + (record.inodeType != kInodeTypeFile && + record.inodeType != kInodeTypeDir && + record.inodeType != kInodeTypeSymlink)) { + return -EINVAL; + } + return 0; + } + if (record.type == EditLogRecord::kRemove) { + return IsLegalName(record.name) ? 0 : -EINVAL; + } + if (record.type == EditLogRecord::kRename) { + if (record.fid < 0 || ! IsLegalName(record.name) || + record.newPath.empty() || + record.newPath.find(char(10)) != std::string::npos) { + return -EINVAL; + } + return 0; + } + return -EINVAL; + } +} + +Config::Config() + : enabledFlag(false), + rpcEnabledFlag(false), + dirLargeThreshold(4096), + dirPromoteMaxWallMs(1000), + dirShardCount(128) + {} + + Config +Config::FromProperties( + const Properties& props) +{ + Config cfg; + cfg.enabledFlag = props.getValue( + "metaServer.namespaceV2.enabled", cfg.enabledFlag ? 1 : 0) != 0; + cfg.rpcEnabledFlag = props.getValue( + "metaServer.namespaceV2.rpcEnabled", + cfg.rpcEnabledFlag ? 
1 : 0) != 0; + cfg.dirLargeThreshold = std::max(1, props.getValue( + "metaServer.dir.largeThreshold", cfg.dirLargeThreshold)); + cfg.dirPromoteMaxWallMs = std::max(1, props.getValue( + "metaServer.dir.promoteMaxWallMs", cfg.dirPromoteMaxWallMs)); + cfg.dirShardCount = std::max(1, props.getValue( + "metaServer.namespaceV2.dirShardCount", cfg.dirShardCount)); + return cfg; +} + + void +SetParameters( + const Properties& props) +{ + sConfig = Config::FromProperties(props); +} + + const Config& +GetConfig() +{ + return sConfig; +} + + + NamespaceStore& +GetStore() +{ + static NamespaceStore* sStorePtr = 0; + if (! sStorePtr) { + sStorePtr = new NamespaceStore(GetConfig()); + } + return *sStorePtr; +} + +NameKey::NameKey() + : hash(0), + name() + {} + +NameKey::NameKey( + const std::string& inName) + : hash(HashName(inName)), + name(inName) + {} + +NameKey::NameKey( + uint64_t inHash, + const std::string& inName) + : hash(inHash), + name(inName) + {} + + bool +NameKey::operator<( + const NameKey& other) const +{ + return hash < other.hash || (hash == other.hash && name < other.name); +} + + bool +NameKey::operator==( + const NameKey& other) const +{ + return hash == other.hash && name == other.name; +} + + size_t +NameKeyHash::operator()( + const NameKey& key) const +{ + return size_t(key.hash ^ (key.hash >> 33)) ^ + (std::hash()(key.name) << 1); +} + +VersionedDirEntry::VersionedDirEntry() + : childFid(-1), + createTxn(kNoTxn), + deleteTxn(kNoTxn), + pendingFlag(false) + {} + +VersionedDirEntry::VersionedDirEntry( + fid_t childFid, + TxnId createTxn) + : childFid(childFid), + createTxn(createTxn), + deleteTxn(kNoTxn), + pendingFlag(true) + {} + + bool +VersionedDirEntry::IsVisible( + TxnId committedTxn) const +{ + return createTxn <= committedTxn && ! 
IsDeletedAt(*this, committedTxn); +} + +InodeRecord::InodeRecord() + : fid(-1), + parentFid(-1), + type(kInodeTypeFile), + createTxn(kNoTxn), + deleteTxn(kNoTxn), + pendingFlag(false), + generation(0), + user(kKfsUserRoot), + group(kKfsGroupRoot), + mode(0), + numReplicas(1), + mtime(0), + ctime(0), + atime(0) + {} + +InodeRecord::InodeRecord( + fid_t inFid, + fid_t inParentFid, + InodeType inType, + TxnId inCreateTxn, + kfsUid_t inUser, + kfsGid_t inGroup, + kfsMode_t inMode, + int16_t inNumReplicas, + int64_t inMtime) + : fid(inFid), + parentFid(inParentFid), + type(inType), + createTxn(inCreateTxn), + deleteTxn(kNoTxn), + pendingFlag(true), + generation(0), + user(inUser), + group(inGroup), + mode(inMode), + numReplicas(inNumReplicas), + mtime(inMtime), + ctime(inMtime), + atime(inMtime) + {} + + bool +InodeRecord::IsVisible( + TxnId committedTxn) const +{ + return createTxn <= committedTxn && + (deleteTxn == kNoTxn || committedTxn < deleteTxn); +} + + InodeTable::InodeTable() + : mTables(kNamespaceV2ShardCount) + {} + + bool +InodeTable::Insert( + const InodeRecord& record) +{ + Table& table = mTables[GetLockShard(record.fid)]; + return table.insert(std::make_pair(record.fid, record)).second; +} + + InodeRecord* +InodeTable::Find( + fid_t fid) +{ + Table& table = mTables[GetLockShard(fid)]; + Table::iterator const it = table.find(fid); + return it == table.end() ? 0 : &it->second; +} + + const InodeRecord* +InodeTable::Find( + fid_t fid) const +{ + const Table& table = mTables[GetLockShard(fid)]; + Table::const_iterator const it = table.find(fid); + return it == table.end() ? 0 : &it->second; +} + + const InodeRecord* +InodeTable::FindCommitted( + fid_t fid, + TxnId committedTxn) const +{ + const InodeRecord* const record = Find(fid); + return record && record->IsVisible(committedTxn) ? record : 0; +} + + bool +InodeTable::MarkDeleted( + fid_t fid, + TxnId deleteTxn) +{ + InodeRecord* const record = Find(fid); + if (! 
record || record->deleteTxn != kNoTxn) { + return false; + } + record->deleteTxn = deleteTxn; + record->pendingFlag = true; + record->generation++; + return true; +} + + bool +InodeTable::Move( + fid_t fid, + fid_t parentFid) +{ + InodeRecord* const record = Find(fid); + if (! record || record->deleteTxn != kNoTxn) { + return false; + } + record->parentFid = parentFid; + record->generation++; + return true; +} + + void +InodeTable::GetCommitted( + TxnId committedTxn, + std::vector& records) const +{ + records.clear(); + size_t size = 0; + for (Tables::const_iterator tableIt = mTables.begin(); + tableIt != mTables.end(); + ++tableIt) { + size += tableIt->size(); + } + records.reserve(size); + for (Tables::const_iterator tableIt = mTables.begin(); + tableIt != mTables.end(); + ++tableIt) { + for (Table::const_iterator it = tableIt->begin(); + it != tableIt->end(); + ++it) { + if (it->second.IsVisible(committedTxn)) { + records.push_back(it->second); + } + } + } + std::sort(records.begin(), records.end(), + [](const InodeRecord& lhs, const InodeRecord& rhs) { + return lhs.fid < rhs.fid; + }); +} + + void +InodeTable::CommitThrough( + TxnId committedTxn) +{ + for (Tables::iterator tableIt = mTables.begin(); + tableIt != mTables.end(); + ++tableIt) { + for (Table::iterator it = tableIt->begin(); + it != tableIt->end(); + ++it) { + if (it->second.createTxn <= committedTxn && + (it->second.deleteTxn == kNoTxn || + it->second.deleteTxn <= committedTxn)) { + it->second.pendingFlag = false; + } + } + } +} + + size_t +InodeTable::Size() const +{ + size_t ret = 0; + for (Tables::const_iterator it = mTables.begin(); + it != mTables.end(); + ++it) { + ret += it->size(); + } + return ret; +} + +ReaddirCookie::ReaddirCookie() + : generation(0), + layout(kDirStateSmall), + hasLastKeyFlag(false), + lastKey() + {} + +ReaddirResult::Entry::Entry( + const NameKey& inKey, + fid_t inChildFid) + : key(inKey), + childFid(inChildFid) + {} + +ReaddirResult::ReaddirResult() + : entries(), + 
moreEntriesFlag(false), + nextCookie() + {} + +CheckpointDirEntry::CheckpointDirEntry( + fid_t inParentFid, + const NameKey& inKey, + fid_t inChildFid) + : parentFid(inParentFid), + key(inKey), + childFid(inChildFid) + {} + +DirNode::DirNode( + int largeThreshold, + int promoteMaxWallMs) + : mState(kDirStateSmall), + mGeneration(0), + mLargeThreshold(std::max(1, largeThreshold)), + mPromoteMaxWallMs(std::max(1, promoteMaxWallMs)), + mChildCount(0), + mSmall(), + mLarge() + {} + + DirState +DirNode::GetCookieLayout() const +{ + return mState == kDirStateLarge ? kDirStateLarge : kDirStateSmall; +} + + DirState +DirNode::GetState() const +{ + return mState; +} + + uint64_t +DirNode::GetGeneration() const +{ + return mGeneration; +} + + void +DirNode::SetGeneration( + uint64_t generation) +{ + mGeneration = generation; +} + + size_t +DirNode::GetChildCount() const +{ + return mChildCount; +} + + bool +DirNode::IsLarge() const +{ + return mState == kDirStateLarge; +} + + bool +DirNode::HasVisibleOrPendingName( + const NameKey& key, + TxnId committedTxn) const +{ + return IsVisibleOrPendingEntry(Find(key), committedTxn); +} + + bool +DirNode::HasVisibleOrPendingName( + const std::string& name, + TxnId committedTxn) const +{ + return HasVisibleOrPendingName(NameKey(name), committedTxn); +} + + int +DirNode::InsertPending( + const std::string& name, + fid_t childFid, + TxnId txnId, + TxnId committedTxn, + bool replaceDeletedFlag) +{ + if (! 
IsLegalName(name)) { + return -EINVAL; + } + return InsertPending(NameKey(name), childFid, txnId, committedTxn, + replaceDeletedFlag); +} + + int +DirNode::InsertPending( + const NameKey& key, + fid_t childFid, + TxnId txnId, + TxnId committedTxn, + bool replaceDeletedFlag) +{ + if (txnId == kNoTxn || childFid < 0) { + return -EINVAL; + } + if (mState == kDirStatePromoting) { + return -EBUSY; + } + VersionedDirEntry* const oldEntry = FindMutable(key); + const bool replaceFlag = replaceDeletedFlag && oldEntry && + oldEntry->deleteTxn == txnId; + if (! replaceFlag && + IsVisibleOrPendingEntry(oldEntry, committedTxn)) { + return -EEXIST; + } + if (mState == kDirStateSmall && ! replaceFlag && + (int)(mSmall.size() + 1) > mLargeThreshold) { + const int status = Promote(); + if (status != 0) { + return status; + } + } + VersionedDirEntry entry(childFid, txnId); + if (mState == kDirStateLarge) { + mLarge[key] = entry; + } else { + mSmall[key] = entry; + IncrementSmallGeneration(); + } + mChildCount++; + return 0; +} + + int +DirNode::InsertCommitted( + const std::string& name, + fid_t childFid) +{ + if (childFid < 0 || ! 
IsLegalName(name)) { + return -EINVAL; + } + const NameKey key(name); + if (Find(key)) { + return -EEXIST; + } + if (mState == kDirStateSmall && + (int)(mSmall.size() + 1) > mLargeThreshold) { + const int status = Promote(); + if (status != 0) { + return status; + } + } + VersionedDirEntry entry(childFid, kNoTxn); + entry.pendingFlag = false; + if (mState == kDirStateLarge) { + mLarge[key] = entry; + } else { + mSmall[key] = entry; + IncrementSmallGeneration(); + } + mChildCount++; + return 0; +} + + void +DirNode::GetCommittedEntries( + TxnId committedTxn, + std::vector& entries) const +{ + if (mState == kDirStateLarge) { + for (LargeEntries::const_iterator it = mLarge.begin(); + it != mLarge.end(); + ++it) { + if (it->second.IsVisible(committedTxn)) { + entries.push_back(ReaddirResult::Entry( + it->first, it->second.childFid)); + } + } + } else { + std::vector sorted; + sorted.reserve(mSmall.size()); + for (SmallEntries::const_iterator it = mSmall.begin(); + it != mSmall.end(); + ++it) { + if (it->second.IsVisible(committedTxn)) { + sorted.push_back(ReaddirResult::Entry( + it->first, it->second.childFid)); + } + } + std::sort(sorted.begin(), sorted.end(), + [](const ReaddirResult::Entry& lhs, + const ReaddirResult::Entry& rhs) { + return lhs.key < rhs.key; + }); + entries.insert(entries.end(), sorted.begin(), sorted.end()); + } +} + + int +DirNode::DeletePending( + const std::string& name, + TxnId txnId, + fid_t* childFidPtr) +{ + if (txnId == kNoTxn) { + return -EINVAL; + } + VersionedDirEntry* const entry = FindMutable(NameKey(name)); + if (! 
entry || entry->deleteTxn != kNoTxn) { + return -ENOENT; + } + entry->deleteTxn = txnId; + entry->pendingFlag = true; + if (childFidPtr) { + *childFidPtr = entry->childFid; + } + if (mChildCount > 0) { + mChildCount--; + } + if (mState != kDirStateLarge) { + IncrementSmallGeneration(); + } + return 0; +} + + const VersionedDirEntry* +DirNode::LookupCommitted( + const std::string& name, + TxnId committedTxn) const +{ + const VersionedDirEntry* const entry = Find(NameKey(name)); + return entry && entry->IsVisible(committedTxn) ? entry : 0; +} + + int +DirNode::ReaddirCommitted( + TxnId committedTxn, + const ReaddirCookie* cookiePtr, + size_t maxEntries, + ReaddirResult& result) const +{ + result = ReaddirResult(); + if (maxEntries == 0) { + return 0; + } + const DirState cookieLayout = GetCookieLayout(); + if (cookiePtr) { + if (cookiePtr->generation != mGeneration || + cookiePtr->layout != cookieLayout) { + return -EINVAL; + } + } + result.nextCookie.generation = mGeneration; + result.nextCookie.layout = cookieLayout; + const bool hasLastKeyFlag = cookiePtr && cookiePtr->hasLastKeyFlag; + const NameKey lastKey = hasLastKeyFlag ? cookiePtr->lastKey : NameKey(); + const SmallEntries* const entriesMap = + mState == kDirStateLarge ? 0 : &mSmall; + const LargeEntries* const largeMap = + mState == kDirStateLarge ? &mLarge : 0; + std::vector > entries; + if (largeMap) { + entries.reserve(largeMap->size()); + for (LargeEntries::const_iterator it = largeMap->begin(); + it != largeMap->end(); + ++it) { + if (it->second.IsVisible(committedTxn) && + (! hasLastKeyFlag || lastKey < it->first)) { + entries.push_back(*it); + } + } + } else if (entriesMap) { + entries.reserve(entriesMap->size()); + for (SmallEntries::const_iterator it = entriesMap->begin(); + it != entriesMap->end(); + ++it) { + if (it->second.IsVisible(committedTxn) && + (! 
hasLastKeyFlag || lastKey < it->first)) { + entries.push_back(*it); + } + } + } + std::sort(entries.begin(), entries.end(), + [](const std::pair& lhs, + const std::pair& rhs) { + return lhs.first < rhs.first; + }); + for (std::vector >::const_iterator + it = entries.begin(); + it != entries.end(); + ++it) { + if (result.entries.size() >= maxEntries) { + result.moreEntriesFlag = true; + break; + } + result.entries.push_back(ReaddirResult::Entry( + it->first, it->second.childFid)); + } + if (! result.entries.empty()) { + result.nextCookie.hasLastKeyFlag = true; + result.nextCookie.lastKey = result.entries.back().key; + } + return 0; +} + + void +DirNode::CommitThrough( + TxnId committedTxn) +{ + if (mState == kDirStateLarge) { + for (LargeEntries::iterator it = mLarge.begin(); + it != mLarge.end(); + ++it) { + if (it->second.createTxn <= committedTxn && + (it->second.deleteTxn == kNoTxn || + it->second.deleteTxn <= committedTxn)) { + it->second.pendingFlag = false; + } + } + } else { + for (SmallEntries::iterator it = mSmall.begin(); + it != mSmall.end(); + ++it) { + if (it->second.createTxn <= committedTxn && + (it->second.deleteTxn == kNoTxn || + it->second.deleteTxn <= committedTxn)) { + it->second.pendingFlag = false; + } + } + } +} + + int +DirNode::Promote() +{ + if (mState == kDirStateLarge) { + return 0; + } + if (mState != kDirStateSmall) { + return -EINVAL; + } + enum { kPromoteBatchSize = 512 }; + mState = kDirStatePromoting; + const int64_t deadlineUsec = microseconds() + + (int64_t)mPromoteMaxWallMs * 1000; + LargeEntries staging; + staging.reserve(mSmall.size()); + SmallEntries::const_iterator it = mSmall.begin(); + while (it != mSmall.end()) { + for (size_t batch = 0; + batch < kPromoteBatchSize && it != mSmall.end(); + ++batch, ++it) { + staging.insert(*it); + } + if (it != mSmall.end() && microseconds() > deadlineUsec) { + mState = kDirStateSmall; + return -EBUSY; + } + } + mLarge.swap(staging); + mSmall.clear(); + mState = kDirStateLarge; + 
mGeneration++; + return 0; +} + + VersionedDirEntry* +DirNode::FindMutable( + const NameKey& key) +{ + if (mState == kDirStateLarge) { + LargeEntries::iterator const it = mLarge.find(key); + return it == mLarge.end() ? 0 : &it->second; + } + SmallEntries::iterator const it = mSmall.find(key); + return it == mSmall.end() ? 0 : &it->second; +} + + const VersionedDirEntry* +DirNode::Find( + const NameKey& key) const +{ + if (mState == kDirStateLarge) { + LargeEntries::const_iterator const it = mLarge.find(key); + return it == mLarge.end() ? 0 : &it->second; + } + SmallEntries::const_iterator const it = mSmall.find(key); + return it == mSmall.end() ? 0 : &it->second; +} + + void +DirNode::IncrementSmallGeneration() +{ + if (mState != kDirStateLarge) { + mGeneration++; + } +} + + DirTable::DirTable() + : mTables(kNamespaceV2ShardCount) + {} + + bool +DirTable::Insert( + fid_t dirFid, + const DirNode& dir) +{ + Table& table = mTables[GetLockShard(dirFid)]; + return table.insert(std::make_pair(dirFid, dir)).second; +} + + DirNode* +DirTable::Find( + fid_t dirFid) +{ + Table& table = mTables[GetLockShard(dirFid)]; + Table::iterator const it = table.find(dirFid); + return it == table.end() ? 0 : &it->second; +} + + const DirNode* +DirTable::Find( + fid_t dirFid) const +{ + const Table& table = mTables[GetLockShard(dirFid)]; + Table::const_iterator const it = table.find(dirFid); + return it == table.end() ? 
0 : &it->second; +} + + void +DirTable::CommitThrough( + TxnId committedTxn) +{ + for (Tables::iterator tableIt = mTables.begin(); + tableIt != mTables.end(); + ++tableIt) { + for (Table::iterator it = tableIt->begin(); + it != tableIt->end(); + ++it) { + it->second.CommitThrough(committedTxn); + } + } +} + + size_t +DirTable::Size() const +{ + size_t ret = 0; + for (Tables::const_iterator it = mTables.begin(); + it != mTables.end(); + ++it) { + ret += it->size(); + } + return ret; +} + + void +DirTable::GetCommittedEntries( + TxnId committedTxn, + std::vector& entries) const +{ + entries.clear(); + std::vector dirFids; + dirFids.reserve(Size()); + for (Tables::const_iterator tableIt = mTables.begin(); + tableIt != mTables.end(); + ++tableIt) { + for (Table::const_iterator it = tableIt->begin(); + it != tableIt->end(); + ++it) { + dirFids.push_back(it->first); + } + } + std::sort(dirFids.begin(), dirFids.end()); + for (std::vector::const_iterator it = dirFids.begin(); + it != dirFids.end(); + ++it) { + const DirNode* const dir = Find(*it); + if (! 
dir) { + continue; + } + std::vector dirEntries; + dir->GetCommittedEntries(committedTxn, dirEntries); + for (std::vector::const_iterator entryIt = + dirEntries.begin(); + entryIt != dirEntries.end(); + ++entryIt) { + entries.push_back(CheckpointDirEntry( + *it, entryIt->key, entryIt->childFid)); + } + } +} + /* Replaces the contents of generations with one (directory fid, DirNode::GetGeneration()) pair per directory, sorted by fid. */ + void +DirTable::GetDirGenerations( + std::vector >& generations) const +{ + generations.clear(); + generations.reserve(Size()); + for (Tables::const_iterator tableIt = mTables.begin(); + tableIt != mTables.end(); + ++tableIt) { + for (Table::const_iterator it = tableIt->begin(); + it != tableIt->end(); + ++it) { + generations.push_back(std::make_pair( + it->first, it->second.GetGeneration())); + } + } + /* Order by fid only; the generation value does not take part in the comparison. */ std::sort(generations.begin(), generations.end(), + [](const std::pair& lhs, + const std::pair& rhs) { + return lhs.first < rhs.first; + }); +} + /* Defaults: fid -1, file type, root user and group, mode 0, one replica, all times and counters zero. */ +LookupResult::LookupResult() + : fid(-1), + type(kInodeTypeFile), + parentGeneration(0), + user(kKfsUserRoot), + group(kKfsGroupRoot), + mode(0), + numReplicas(1), + mtime(0), + ctime(0), + atime(0), + fileCount(0), + dirCount(0) + {} + /* Defaults: fid -1, txnId kNoTxn, parentGeneration 0. */ +CreateResult::CreateResult() + : fid(-1), + txnId(kNoTxn), + parentGeneration(0) + {} + /* Default record has type kInvalid, txnId kNoTxn and fids -1; the remaining attributes match the LookupResult defaults. */ +EditLogRecord::EditLogRecord() + : type(kInvalid), + txnId(kNoTxn), + parentFid(-1), + name(), + fid(-1), + inodeType(kInodeTypeFile), + user(kKfsUserRoot), + group(kKfsGroupRoot), + mode(0), + numReplicas(1), + mtime(0), + newPath(), + overwriteFlag(false) + {} + /* Serializes record as one text line: the namespacev2_edit magic, version 1, the record type name, then space separated fields that depend on record.type (create, remove or rename), ended by a newline. Returns the non-zero status of ValidateEditLogRecord() without writing anything, -EIO when the stream is not good after the write, 0 otherwise. */ + int +WriteEditLog( + std::ostream& os, + const EditLogRecord& record) +{ + const int status = ValidateEditLogRecord(record); + if (status != 0) { + return status; + } + os << "namespacev2_edit 1 " << EditLogRecordTypeName(record.type) << " "; + if (record.type == EditLogRecord::kCreate) { + os << record.txnId << " " << record.parentFid << " " << + record.fid << " " << InodeTypeToInt(record.inodeType) << + " " << record.user << " " << record.group << " " << + record.mode << " " << record.numReplicas << " " << + record.mtime << " " <<
EncodeName(record.name); + } else if (record.type == EditLogRecord::kRemove) { + os << record.txnId << " " << record.parentFid << " " << + InodeTypeToInt(record.inodeType) << " " << + EncodeName(record.name); + } else if (record.type == EditLogRecord::kRename) { + /* NOTE(review): newPath is written with EncodeString() while ReadEditLog() decodes it with DecodePath() - confirm the two are exact inverses. */ os << record.txnId << " " << record.parentFid << " " << + record.fid << " " << (record.overwriteFlag ? 1 : 0) << + " " << EncodeName(record.name) << " " << + EncodeString(record.newPath); + } + /* char(10) is the newline that terminates the record. */ os << char(10); + return os.good() ? 0 : -EIO; +} + /* Parses one edit log text line into record. The line must start with the namespacev2_edit magic and version 1, followed by the operation (create, remove or rename) and its fields in the order WriteEditLog() emits them. Returns -EINVAL for a bad header, an unknown operation or fields that fail to parse or decode; parsing goes into a temporary record first. */ + int +ReadEditLog( + const std::string& line, + EditLogRecord& record) +{ + std::istringstream is(line); + std::string magic; + int version = 0; + std::string op; + if (! (is >> magic >> version >> op) || + magic != "namespacev2_edit" || version != 1) { + return -EINVAL; + } + EditLogRecord tmp; + if (op == "create") { + int typeValue = -1; + std::string encodedName; + if (! (is >> tmp.txnId >> tmp.parentFid >> tmp.fid >> + typeValue >> tmp.user >> tmp.group >> tmp.mode >> + tmp.numReplicas >> tmp.mtime >> encodedName) || + ! IntToInodeType(typeValue, tmp.inodeType) || + ! DecodeName(encodedName, tmp.name)) { + return -EINVAL; + } + tmp.type = EditLogRecord::kCreate; + } else if (op == "remove") { + int typeValue = -1; + std::string encodedName; + if (! (is >> tmp.txnId >> tmp.parentFid >> typeValue >> + encodedName) || + ! IntToInodeType(typeValue, tmp.inodeType) || + ! DecodeName(encodedName, tmp.name)) { + return -EINVAL; + } + tmp.type = EditLogRecord::kRemove; + } else if (op == "rename") { + int overwrite = 0; + std::string encodedName; + std::string encodedPath; + if (! (is >> tmp.txnId >> tmp.parentFid >> tmp.fid >> + overwrite >> encodedName >> encodedPath) || + ! DecodeName(encodedName, tmp.name) || + !
DecodePath(encodedPath, tmp.newPath)) { + return -EINVAL; + } + tmp.type = EditLogRecord::kRename; + tmp.overwriteFlag = overwrite != 0; + } else { + return -EINVAL; + } + /* Any token left after the expected fields makes the line invalid. */ std::string extra; + if (is >> extra) { + return -EINVAL; + } + const int status = ValidateEditLogRecord(tmp); + if (status != 0) { + return status; + } + /* The caller's record is overwritten only after parsing and validation both succeeded. */ record = tmp; + return 0; +} + /* Builds a store that holds only the root directory: the root inode is its own parent, is not pending, has mode 0777, and gets an empty DirNode configured from mConfig. Fid allocation starts right after rootFid; both txn counters start at kNoTxn. */ +NamespaceStore::NamespaceStore( + const Config& inConfig, + fid_t rootFid) + : mConfig(inConfig), + mRootFid(rootFid), + mNextFid(rootFid + 1), + mNextTxn(kNoTxn), + mCommittedTxn(kNoTxn), + mInodes(), + mDirs() +{ + InodeRecord root; + root.fid = rootFid; + root.parentFid = rootFid; + root.type = kInodeTypeDir; + root.createTxn = kNoTxn; + root.deleteTxn = kNoTxn; + root.pendingFlag = false; + root.mode = 0777; + mInodes.Insert(root); + mDirs.Insert(rootFid, DirNode( + mConfig.dirLargeThreshold, mConfig.dirPromoteMaxWallMs)); +} + /* Returns the fid of the root directory. */ + fid_t +NamespaceStore::GetRootFid() const +{ + return mRootFid; +} + /* Same value as GetCommittedTxnSnapshot(). */ + TxnId +NamespaceStore::GetCommittedTxn() const +{ + return GetCommittedTxnSnapshot(); +} + /* Same value as GetLastTxnSnapshot(). */ + TxnId +NamespaceStore::GetLastTxn() const +{ + return GetLastTxnSnapshot(); +} + /* Returns mInodes.Size() while holding every inode shard mutex. */ + size_t +NamespaceStore::GetInodeCount() const +{ + std::vector locks; + AddAllInodeShardMutexes(locks); + ScopedMutexGroup locker(locks); + return mInodes.Size(); +} + /* Returns mDirs.Size() while holding every directory shard mutex. */ + size_t +NamespaceStore::GetDirCount() const +{ + std::vector locks; + AddAllDirShardMutexes(locks); + ScopedMutexGroup locker(locks); + return mDirs.Size(); +} + /* Acquire load of the atomic mCommittedTxn; no mutex is taken. */ + TxnId +NamespaceStore::GetCommittedTxnSnapshot() const +{ + return mCommittedTxn.load(std::memory_order_acquire); +} + /* Reads mNextTxn under the txn mutex. */ + TxnId +NamespaceStore::GetLastTxnSnapshot() const +{ + ScopedMutex locker(GetTxnMutex()); + return mNextTxn; +} + /* Reserves one fid and one txn id; forwards to AllocateCreateIds(). */ + void +NamespaceStore::ReserveCreateIds( + fid_t& childFid, + TxnId& txnId) +{ + AllocateCreateIds(childFid, txnId); +} + /* Reserves count consecutive fids and count consecutive txn ids under the txn mutex and returns the first of each. With count equal to 0 nothing is reserved and the outputs are set to -1 and 0. */ + void +NamespaceStore::ReserveCreateIdsRange( + size_t count, + fid_t& firstFid, + TxnId& firstTxn) +{ + if (count == 0) { + firstFid = -1; + firstTxn = 0; +
return; + } + ScopedMutex locker(GetTxnMutex()); + firstFid = mNextFid; + firstTxn = mNextTxn + 1; + /* mNextFid ends one past the last reserved fid, mNextTxn ends on the last reserved txn id. */ mNextFid = firstFid + (fid_t)count; + mNextTxn = firstTxn + (TxnId)count - 1; +} + /* Under the txn mutex hands out the current mNextFid and the txn id following mNextTxn, then advances both counters by one. */ + void +NamespaceStore::AllocateCreateIds( + fid_t& childFid, + TxnId& txnId) +{ + ScopedMutex locker(GetTxnMutex()); + childFid = mNextFid; + txnId = mNextTxn + 1; + mNextFid = childFid + 1; + mNextTxn = txnId; +} + /* Under the txn mutex increments mNextTxn and returns the new value. */ + TxnId +NamespaceStore::AllocateTxnId() +{ + ScopedMutex locker(GetTxnMutex()); + const TxnId txnId = mNextTxn + 1; + mNextTxn = txnId; + return txnId; +} + /* Raises the allocation seeds so that they cover the given ids: mNextFid becomes at least one past fid (skipped when fid is negative) and mNextTxn at least txnId. The seeds never move backwards. */ + void +NamespaceStore::AdvanceSeeds( + fid_t fid, + TxnId txnId) +{ + ScopedMutex locker(GetTxnMutex()); + if (fid >= 0) { + mNextFid = std::max(mNextFid, fid + 1); + } + mNextTxn = std::max(mNextTxn, txnId); +} + /* Public create entry point; forwards all arguments to ApplyCreatePending(). */ + int +NamespaceStore::Create( + fid_t parentFid, + const std::string& name, + InodeType type, + CreateResult* resultPtr, + kfsUid_t user, + kfsGid_t group, + kfsMode_t mode, + int16_t numReplicas, + int64_t mtime) +{ + return ApplyCreatePending(parentFid, name, type, resultPtr, + user, group, mode, numReplicas, mtime); +} + /* Creates name under parentFid without committing it: validates the request, checks parent and name with CheckCreateParentName() under the parent dir and inode shard mutexes, reserves a fid and txn id, then calls ApplyCreate() with commitFlag false and advanceSeedsFlag true. On success *resultPtr (if given) receives the new fid, the txn id and the parent generation. NOTE(review): the parent mutexes are released between the name check and ApplyCreate() - confirm InsertPending() is what rejects a concurrent duplicate. */ + int +NamespaceStore::ApplyCreatePending( + fid_t parentFid, + const std::string& name, + InodeType type, + CreateResult* resultPtr, + kfsUid_t user, + kfsGid_t group, + kfsMode_t mode, + int16_t numReplicas, + int64_t mtime) +{ + int status = ValidateCreateRequest(name, type); + if (status != 0) { + return status; + } + const NameKey key(name); + { + ScopedSmallMutexGroup parentLocker( + &GetDirShardMutex(parentFid), + &GetInodeShardMutex(parentFid)); + status = CheckCreateParentName( + parentFid, key, GetCommittedTxnSnapshot()); + if (status != 0) { + return status; + } + } + fid_t childFid = -1; + TxnId txnId = kNoTxn; + ReserveCreateIds(childFid, txnId); + status = ApplyCreate(parentFid, name, type, childFid, txnId, + user, group, mode, numReplicas, mtime, false, true); + if (status == 0 && resultPtr) { + resultPtr->fid = childFid; + resultPtr->txnId = txnId; + const DirNode*
const dir = mDirs.Find(parentFid); + resultPtr->parentGeneration = dir ? dir->GetGeneration() : 0; /* NOTE(review): mDirs is read here without any shard mutex taken by this function - confirm this is safe. */ + } + return status; +} + + /* Validating overload: runs ValidateCreateRequest() on the raw name, type and ids, then forwards to the NameKey overload with NameKey(name). */ + int +NamespaceStore::CreateSelf( + fid_t parentFid, + const std::string& name, + InodeType type, + fid_t childFid, + TxnId txnId, + CreateResult* resultPtr, + kfsUid_t user, + kfsGid_t group, + kfsMode_t mode, + int16_t numReplicas, + int64_t mtime, + TxnId committedTxn) +{ + int status = ValidateCreateRequest( + name, type, childFid, txnId, committedTxn); + if (status != 0) { + return status; + } + return CreateSelf(parentFid, NameKey(name), type, childFid, txnId, + resultPtr, user, group, mode, numReplicas, mtime, committedTxn); +} + /* Inserts a pending directory entry for key in the parent, the new inode record, and for directories an empty DirNode. Takes no mutex itself; presumably the caller holds the shard mutexes, as ApplyCreate() does - verify for other callers. Returns -EINVAL for a negative childFid, txnId equal to kNoTxn or not above committedTxn, or an unsupported type; -EEXIST when childFid is already known or an insert is refused; otherwise the status of ResolveCreateParentDir() or InsertPending(). NOTE(review): nothing is rolled back when a later step fails after InsertPending() succeeded. */ + int +NamespaceStore::CreateSelf( + fid_t parentFid, + const NameKey& key, + InodeType type, + fid_t childFid, + TxnId txnId, + CreateResult* resultPtr, + kfsUid_t user, + kfsGid_t group, + kfsMode_t mode, + int16_t numReplicas, + int64_t mtime, + TxnId committedTxn) +{ + if (childFid < 0 || txnId == kNoTxn || txnId <= committedTxn || + ! IsSupportedInodeType(type)) { + return -EINVAL; + } + if (mInodes.Find(childFid)) { + return -EEXIST; + } + DirNode* dir = 0; + int status = ResolveCreateParentDir(parentFid, committedTxn, dir); + if (status != 0) { + return status; + } + status = dir->InsertPending(key, childFid, txnId, committedTxn); + if (status != 0) { + return status; + } + if (! mInodes.Insert(InodeRecord(childFid, parentFid, type, txnId, + user, group, mode, numReplicas, mtime))) { + return -EEXIST; + } + if (type == kInodeTypeDir && + !
mDirs.Insert(childFid, DirNode( + mConfig.dirLargeThreshold, mConfig.dirPromoteMaxWallMs))) { + return -EEXIST; + } + if (resultPtr) { + resultPtr->fid = childFid; + resultPtr->txnId = txnId; + resultPtr->parentGeneration = dir->GetGeneration(); + } + return 0; +} + /* Same steps as CreateSelf(NameKey) except that the parent is located with a plain mDirs.Find() instead of ResolveCreateParentDir(); returns -ENOENT when the parent has no DirNode. Takes no mutex itself. */ + int +NamespaceStore::CreateSelfTrusted( + fid_t parentFid, + const NameKey& key, + InodeType type, + fid_t childFid, + TxnId txnId, + CreateResult* resultPtr, + kfsUid_t user, + kfsGid_t group, + kfsMode_t mode, + int16_t numReplicas, + int64_t mtime, + TxnId committedTxn) +{ + if (childFid < 0 || txnId == kNoTxn || txnId <= committedTxn || + ! IsSupportedInodeType(type)) { + return -EINVAL; + } + if (mInodes.Find(childFid)) { + return -EEXIST; + } + DirNode* const dir = mDirs.Find(parentFid); + if (! dir) { + return -ENOENT; + } + int status = dir->InsertPending(key, childFid, txnId, committedTxn); + if (status != 0) { + return status; + } + if (! mInodes.Insert(InodeRecord(childFid, parentFid, type, txnId, + user, group, mode, numReplicas, mtime))) { + return -EEXIST; + } + if (type == kInodeTypeDir && + ! mDirs.Insert(childFid, DirNode( + mConfig.dirLargeThreshold, mConfig.dirPromoteMaxWallMs))) { + return -EEXIST; + } + if (resultPtr) { + resultPtr->fid = childFid; + resultPtr->txnId = txnId; + resultPtr->parentGeneration = dir->GetGeneration(); + } + return 0; +} + /* Looks up name among the committed entries of parentFid and fills result. The pair (root fid, slash) resolves to the root itself through GetAttr(). Works in two steps per attempt: first the child fid is read under the parent mutexes only, then parent and child shard mutexes are taken together and everything is checked again. If the name then maps to a different child the attempt is repeated, at most 4 times, after which -EAGAIN is returned. Returns -ENOENT when the parent, the entry or the child inode is not committed. */ + int +NamespaceStore::Lookup( + fid_t parentFid, + const std::string& name, + LookupResult& result) const +{ + result = LookupResult(); + if (parentFid == mRootFid && name == "/") { + return GetAttr(mRootFid, result); + } + + for (int retry = 0; retry < 4; ++retry) { + fid_t childFid = -1; + { + ScopedSmallMutexGroup parentLocker( + &GetDirShardMutex(parentFid), + &GetInodeShardMutex(parentFid)); + const TxnId committedTxn = GetCommittedTxnSnapshot(); + if (! FindCommittedDir(parentFid, committedTxn)) { + return -ENOENT; + } + const DirNode* const dir = mDirs.Find(parentFid); + if (!
dir) { + return -ENOENT; + } + const VersionedDirEntry* const entry = + dir->LookupCommitted(name, committedTxn); + if (! entry) { + return -ENOENT; + } + childFid = entry->childFid; + } + /* Second step: the child fid is known now, so take the parent and the child shard mutexes together and repeat the checks. */ + ScopedSmallMutexGroup locker( + &GetDirShardMutex(parentFid), + &GetInodeShardMutex(parentFid), + &GetInodeShardMutex(childFid), + &GetDirShardMutex(childFid)); + const TxnId committedTxn = GetCommittedTxnSnapshot(); + if (! FindCommittedDir(parentFid, committedTxn)) { + return -ENOENT; + } + const DirNode* const dir = mDirs.Find(parentFid); + if (! dir) { + return -ENOENT; + } + const VersionedDirEntry* const entry = + dir->LookupCommitted(name, committedTxn); + if (! entry) { + return -ENOENT; + } + /* The name now maps to another child than the one the mutexes were chosen for: start the attempt over. */ if (entry->childFid != childFid) { + continue; + } + const InodeRecord* const inode = + mInodes.FindCommitted(childFid, committedTxn); + if (! inode) { + return -ENOENT; + } + return FillLookupResult(*inode, dir->GetGeneration(), result); + } + return -EAGAIN; +} + /* Resolves path one component at a time with Lookup() and leaves the attributes of the last component in result. A path that starts with a slash is resolved from mRootFid, any other path from rootFid. Returns -EINVAL for an empty path, -ENOTDIR when a component has to be looked up inside a non-directory, otherwise the first non-zero status of GetAttr() or Lookup(). */ + int +NamespaceStore::LookupPath( + fid_t rootFid, + const std::string& path, + LookupResult& result) const +{ + result = LookupResult(); + if (path.empty()) { + return -EINVAL; + } + /* The emptiness test below is redundant: an empty path was already rejected above. */ fid_t curFid = (! path.empty() && path[0] == '/') ?
mRootFid : rootFid; + int status = GetAttr(curFid, result); + if (status != 0) { + return status; + } + size_t pos = 0; + while (pos < path.size()) { + /* Skip a run of separators; a trailing run ends the walk. */ while (pos < path.size() && path[pos] == '/') { + ++pos; + } + if (pos >= path.size()) { + break; + } + const size_t start = pos; + while (pos < path.size() && path[pos] != '/') { + ++pos; + } + const std::string name = path.substr(start, pos - start); + /* A single dot component is skipped. NOTE(review): a double dot component gets no special treatment here and is handed to Lookup() like any other name. */ if (name == ".") { + continue; + } + if (result.type != kInodeTypeDir) { + return -ENOTDIR; + } + status = Lookup(curFid, name, result); + if (status != 0) { + return status; + } + curFid = result.fid; + } + return 0; +} + /* Fills result with the attributes of the committed inode fid. Works in two steps per attempt: the parent fid is read under the inode shard mutex of fid, then the inode shard of fid and the dir shards of fid and of the parent are taken together and the inode is read again. If the parent changed in between the attempt is repeated, at most 4 times, after which -EAGAIN is returned. Returns -ENOENT when the inode is not committed. */ + int +NamespaceStore::GetAttr( + fid_t fid, + LookupResult& result) const +{ + result = LookupResult(); + for (int retry = 0; retry < 4; ++retry) { + fid_t parentFid = -1; + { + std::vector inodeLocks; + AddInodeShardMutex(inodeLocks, fid); + ScopedMutexGroup inodeLocker(inodeLocks); + const TxnId committedTxn = GetCommittedTxnSnapshot(); + const InodeRecord* const inode = + mInodes.FindCommitted(fid, committedTxn); + if (! inode) { + return -ENOENT; + } + parentFid = inode->parentFid; + } + + std::vector locks; + AddInodeShardMutex(locks, fid); + AddDirShardMutex(locks, fid); + AddDirShardMutex(locks, parentFid); + ScopedMutexGroup locker(locks); + const TxnId committedTxn = GetCommittedTxnSnapshot(); + const InodeRecord* const inode = + mInodes.FindCommitted(fid, committedTxn); + if (!
inode) { + return -ENOENT; + } + /* The inode was re-parented between the two steps: start the attempt over. */ if (inode->parentFid != parentFid) { + continue; + } + /* The parent generation stays 0 when the parent has no DirNode. */ uint64_t parentGeneration = 0; + const DirNode* const parentDir = mDirs.Find(parentFid); + if (parentDir) { + parentGeneration = parentDir->GetGeneration(); + } + return FillLookupResult(*inode, parentGeneration, result); + } + return -EAGAIN; +} + /* Lists committed entries of dirFid through DirNode::ReaddirCommitted(), holding the dir and inode shard mutexes of dirFid. cookiePtr, maxEntries and result are passed through unchanged. Returns -ENOENT when dirFid is not a committed directory or has no DirNode. */ + int +NamespaceStore::Readdir( + fid_t dirFid, + const ReaddirCookie* cookiePtr, + size_t maxEntries, + ReaddirResult& result) const +{ + std::vector locks; + AddDirShardMutex(locks, dirFid); + AddInodeShardMutex(locks, dirFid); + ScopedMutexGroup locker(locks); + const TxnId committedTxn = GetCommittedTxnSnapshot(); + if (! FindCommittedDir(dirFid, committedTxn)) { + return -ENOENT; + } + const DirNode* const dir = mDirs.Find(dirFid); + return dir ? dir->ReaddirCommitted(committedTxn, cookiePtr, + maxEntries, result) : -ENOENT; +} + /* Readdir variant keyed by a name instead of a caller supplied cookie: builds a cookie from the current generation and layout of the directory with lastKey set to name. An empty name passes no cookie at all. Same locking and -ENOENT conditions as Readdir(). */ + int +NamespaceStore::ReaddirFromName( + fid_t dirFid, + const std::string& name, + size_t maxEntries, + ReaddirResult& result) const +{ + std::vector locks; + AddDirShardMutex(locks, dirFid); + AddInodeShardMutex(locks, dirFid); + ScopedMutexGroup locker(locks); + const TxnId committedTxn = GetCommittedTxnSnapshot(); + if (! FindCommittedDir(dirFid, committedTxn)) { + return -ENOENT; + } + const DirNode* const dir = mDirs.Find(dirFid); + if (! dir) { + return -ENOENT; + } + ReaddirCookie cookie; + cookie.generation = dir->GetGeneration(); + cookie.layout = dir->GetState(); + cookie.hasLastKeyFlag = ! name.empty(); + cookie.lastKey = NameKey(name); + return dir->ReaddirCommitted(committedTxn, + cookie.hasLastKeyFlag ?
&cookie : 0, maxEntries, result); +} + /* Removes a file entry with a newly allocated txn id. NOTE(review): currently identical to RemoveFile() - both call RemoveSelf() with kInodeTypeFile and requireEmptyFlag false. */ + int +NamespaceStore::Remove( + fid_t parentFid, + const std::string& name, + TxnId* txnIdPtr) +{ + return RemoveSelf(parentFid, name, kInodeTypeFile, false, kNoTxn, + txnIdPtr); +} + /* Removes a file entry with a newly allocated txn id; see RemoveSelf() for the status codes. */ + int +NamespaceStore::RemoveFile( + fid_t parentFid, + const std::string& name, + TxnId* txnIdPtr) +{ + return RemoveSelf(parentFid, name, kInodeTypeFile, false, kNoTxn, + txnIdPtr); +} + /* Removes a directory entry with a newly allocated txn id; requireEmptyFlag is set, so the directory must have no children. */ + int +NamespaceStore::Rmdir( + fid_t parentFid, + const std::string& name, + TxnId* txnIdPtr) +{ + return RemoveSelf(parentFid, name, kInodeTypeDir, true, kNoTxn, + txnIdPtr); +} + /* Splits newPath into the destination parent fid and the leaf name. Without any slash the target lives in baseDirFid. Otherwise the part before the last slash is resolved with LookupPath() and must be a directory. Both outputs are reset first. Returns -EINVAL for an empty path or one that ends with a slash, -ENOTDIR when the resolved parent is not a directory, or the LookupPath() status. */ + int +NamespaceStore::ResolveRenameTarget( + fid_t baseDirFid, + const std::string& newPath, + fid_t& dstParentFid, + std::string& dstName) const +{ + dstParentFid = -1; + dstName.clear(); + if (newPath.empty() || newPath[newPath.size() - 1] == '/') { + return -EINVAL; + } + const std::string::size_type rslash = newPath.rfind('/'); + if (rslash == std::string::npos) { + dstParentFid = baseDirFid; + dstName = newPath; + return 0; + } + LookupResult parent; + /* Keep at least one character so that a path whose only slash is the leading one resolves the root. */ const int status = LookupPath(baseDirFid, + newPath.substr(0, std::max(size_t(1), rslash)), parent); + if (status != 0) { + return status; + } + if (parent.type != kInodeTypeDir) { + return -ENOTDIR; + } + dstParentFid = parent.fid; + dstName = newPath.substr(rslash + 1); + return dstName.empty() ? -EINVAL : 0; +} + /* Walks the committed parent links upward from dirFid and returns true when ancestorFid is reached, including the case dirFid equal to ancestorFid. The walk stops with false at the root, at an inode that is not committed, or at an inode that is its own parent. The loop is bounded by the inode count, which limits the walk if the parent links ever form a cycle. Takes no mutex itself. */ + bool +NamespaceStore::IsDescendant( + fid_t ancestorFid, + fid_t dirFid, + TxnId committedTxn) const +{ + fid_t curFid = dirFid; + for (size_t guard = 0; guard <= mInodes.Size(); ++guard) { + if (curFid == ancestorFid) { + return true; + } + if (curFid == mRootFid) { + return false; + } + const InodeRecord* const inode = + mInodes.FindCommitted(curFid, committedTxn); + if (!
inode || inode->parentFid == curFid) { + return false; + } + curFid = inode->parentFid; + } + return false; +} + /* Public rename entry point: forwards to RenameSelf() with txnId kNoTxn, so a new txn id is allocated. */ + int +NamespaceStore::Rename( + fid_t parentFid, + const std::string& oldName, + const std::string& newPath, + bool overwriteFlag, + TxnId* txnIdPtr, + fid_t* srcFidPtr) +{ + return RenameSelf(parentFid, oldName, newPath, overwriteFlag, + kNoTxn, txnIdPtr, srcFidPtr); +} + /* Renames oldName in parentFid to newPath, where newPath is resolved by ResolveRenameTarget() relative to parentFid. With txnId equal to kNoTxn a txn id is allocated, otherwise the given id is used and must be above the committed txn. *txnIdPtr is preset to kNoTxn and *srcFidPtr to -1; the source fid is stored as soon as the source is found, the txn id only when the rename was applied. Each attempt has two phases: a first pass under the mutexes that only discovers whether a destination exists and of which type, and a second pass that re-takes the mutexes (adding the destination directory shard when needed), re-validates and applies. A change of the destination between the phases restarts the attempt, at most 4 times, after which -EAGAIN is returned. */ + int +NamespaceStore::RenameSelf( + fid_t parentFid, + const std::string& oldName, + const std::string& newPath, + bool overwriteFlag, + TxnId txnId, + TxnId* txnIdPtr, + fid_t* srcFidPtr) +{ + if (txnIdPtr) { + *txnIdPtr = kNoTxn; + } + if (srcFidPtr) { + *srcFidPtr = -1; + } + if (! IsLegalName(oldName)) { + return -EINVAL; + } + + fid_t dstParentFid = -1; + std::string dstName; + int status = ResolveRenameTarget(parentFid, newPath, + dstParentFid, dstName); + if (status != 0) { + return status; + } + if (! IsLegalName(dstName)) { + return -EINVAL; + } + + for (int retry = 0; retry < 4; ++retry) { + fid_t phaseDstFid = -1; + InodeType phaseDstType = kInodeTypeFile; + bool phaseDstExistsFlag = false; + { + /* Phase 1: all inode shards plus the source and destination parent dir shards. */ std::vector phaseLocks; + AddAllInodeShardMutexes(phaseLocks); + AddDirShardMutex(phaseLocks, parentFid); + AddDirShardMutex(phaseLocks, dstParentFid); + ScopedMutexGroup phaseLocker(phaseLocks); + const TxnId committedTxn = GetCommittedTxnSnapshot(); + if (txnId != kNoTxn && txnId <= committedTxn) { + return -EINVAL; + } + if (! FindCommittedDir(parentFid, committedTxn)) { + return -ENOENT; + } + DirNode* const srcDir = mDirs.Find(parentFid); + if (! srcDir) { + return -ENOENT; + } + const VersionedDirEntry* const srcEntry = + srcDir->LookupCommitted(oldName, committedTxn); + if (! srcEntry) { + return -ENOENT; + } + const InodeRecord* const srcInode = + mInodes.FindCommitted(srcEntry->childFid, committedTxn); + if (!
srcInode) { + return -ENOENT; + } + if (srcFidPtr) { + *srcFidPtr = srcInode->fid; + } + /* Renaming an entry onto itself succeeds without allocating a txn id. */ if (parentFid == dstParentFid && oldName == dstName) { + return 0; + } + if (! FindCommittedDir(dstParentFid, committedTxn)) { + return -ENOENT; + } + DirNode* const dstDir = mDirs.Find(dstParentFid); + if (! dstDir) { + return -ENOENT; + } + const VersionedDirEntry* const dstEntry = + dstDir->LookupCommitted(dstName, committedTxn); + if (dstEntry) { + const InodeRecord* const dstInode = + mInodes.FindCommitted(dstEntry->childFid, committedTxn); + if (! dstInode) { + return -ENOENT; + } + /* Remember what the destination looked like so that phase 2 can lock it and detect a change. */ phaseDstExistsFlag = true; + phaseDstFid = dstInode->fid; + phaseDstType = dstInode->type; + } + } + /* Phase 2: take the mutexes again, now including the dir shard of an existing destination directory, and repeat every check before modifying anything. */ + std::vector locks; + AddAllInodeShardMutexes(locks); + AddDirShardMutex(locks, parentFid); + AddDirShardMutex(locks, dstParentFid); + if (phaseDstExistsFlag && phaseDstType == kInodeTypeDir) { + AddDirShardMutex(locks, phaseDstFid); + } + ScopedMutexGroup locker(locks); + const TxnId committedTxn = GetCommittedTxnSnapshot(); + if (txnId != kNoTxn && txnId <= committedTxn) { + return -EINVAL; + } + if (! FindCommittedDir(parentFid, committedTxn)) { + return -ENOENT; + } + DirNode* const srcDir = mDirs.Find(parentFid); + if (! srcDir) { + return -ENOENT; + } + const VersionedDirEntry* const srcEntry = + srcDir->LookupCommitted(oldName, committedTxn); + if (! srcEntry) { + return -ENOENT; + } + InodeRecord* const srcInode = mInodes.Find(srcEntry->childFid); + if (! srcInode || ! srcInode->IsVisible(committedTxn)) { + return -ENOENT; + } + if (srcFidPtr) { + *srcFidPtr = srcInode->fid; + } + if (parentFid == dstParentFid && oldName == dstName) { + return 0; + } + if (! FindCommittedDir(dstParentFid, committedTxn)) { + return -ENOENT; + } + /* A directory must not be moved into itself or into one of its own descendants. */ if (srcInode->type == kInodeTypeDir && + IsDescendant(srcInode->fid, dstParentFid, committedTxn)) { + return -EINVAL; + } + DirNode* const dstDir = mDirs.Find(dstParentFid); + if (!
dstDir) { + return -ENOENT; + } + const VersionedDirEntry* const dstEntry = + dstDir->LookupCommitted(dstName, committedTxn); + InodeRecord* dstInode = 0; + if (dstEntry) { + dstInode = mInodes.Find(dstEntry->childFid); + if (! dstInode || ! dstInode->IsVisible(committedTxn)) { + return -ENOENT; + } + /* The destination differs from what phase 1 saw, so the mutex set may be wrong: start the attempt over. */ if (! phaseDstExistsFlag || phaseDstFid != dstInode->fid || + phaseDstType != dstInode->type) { + continue; + } + if (! overwriteFlag) { + return -EEXIST; + } + if (srcInode->type != dstInode->type) { + return srcInode->type == kInodeTypeDir ? -ENOTDIR : -EISDIR; + } + if (dstInode->type == kInodeTypeDir) { + const DirNode* const childDir = mDirs.Find(dstInode->fid); + if (! childDir) { + return -ENOENT; + } + if (childDir->GetChildCount() != 0) { + return -ENOTEMPTY; + } + } + } else if (phaseDstExistsFlag) { + continue; + } + /* All checks passed: apply the rename. NOTE(review): the steps below are not rolled back when a later one fails - confirm callers treat such a failure as fatal. */ + const TxnId opTxnId = txnId == kNoTxn ? AllocateTxnId() : txnId; + if (dstEntry) { + fid_t deletedFid = -1; + status = dstDir->DeletePending(dstName, opTxnId, &deletedFid); + if (status != 0) { + return status; + } + if (! dstInode || deletedFid != dstInode->fid || + ! mInodes.MarkDeleted(deletedFid, opTxnId)) { + return -EIO; + } + } + fid_t movedFid = -1; + status = srcDir->DeletePending(oldName, opTxnId, &movedFid); + if (status != 0) { + return status; + } + if (movedFid != srcInode->fid) { + return -EIO; + } + status = dstDir->InsertPending(dstName, srcInode->fid, opTxnId, + committedTxn, dstEntry != 0); + if (status != 0) { + return status; + } + if (parentFid != dstParentFid && + ! mInodes.Move(srcInode->fid, dstParentFid)) { + return -EIO; + } + /* Only a caller supplied txn id needs the seed raised; an id from AllocateTxnId() already advanced mNextTxn. */ if (txnId != kNoTxn) { + AdvanceSeeds(-1, opTxnId); + } + if (txnIdPtr) { + *txnIdPtr = opTxnId; + } + return 0; + } + return -EAGAIN; +} + /* Removes the committed entry name from parentFid and marks its inode deleted under one txn id. type is the inode type the entry must have; requireEmptyFlag additionally requires the child directory to have no children. With txnId equal to kNoTxn a txn id is allocated, otherwise the given id is used and must be above the committed txn. *txnIdPtr is preset to kNoTxn and receives the txn id on success. */ + int +NamespaceStore::RemoveSelf( + fid_t parentFid, + const std::string& name, + InodeType type, + bool requireEmptyFlag, + TxnId txnId, + TxnId* txnIdPtr) +{ + if (txnIdPtr) { + *txnIdPtr = kNoTxn; + } + if (!
IsLegalName(name)) { + return -EINVAL; + } + + fid_t childFid = -1; + { + /* Step 1: find the child fid under the parent mutexes only, so that the child shards to lock are known. */ std::vector parentLocks; + AddDirShardMutex(parentLocks, parentFid); + AddInodeShardMutex(parentLocks, parentFid); + ScopedMutexGroup parentLocker(parentLocks); + const TxnId committedTxn = GetCommittedTxnSnapshot(); + if (txnId != kNoTxn && txnId <= committedTxn) { + return -EINVAL; + } + if (! FindCommittedDir(parentFid, committedTxn)) { + return -ENOENT; + } + DirNode* const dir = mDirs.Find(parentFid); + if (! dir) { + return -ENOENT; + } + const VersionedDirEntry* const committedEntry = + dir->LookupCommitted(name, committedTxn); + if (! committedEntry) { + return -ENOENT; + } + childFid = committedEntry->childFid; + } + /* Step 2: lock parent and child together and repeat the checks before modifying anything. */ + std::vector locks; + AddDirShardMutex(locks, parentFid); + AddInodeShardMutex(locks, parentFid); + AddInodeShardMutex(locks, childFid); + if (requireEmptyFlag) { + AddDirShardMutex(locks, childFid); + } + ScopedMutexGroup locker(locks); + const TxnId committedTxn = GetCommittedTxnSnapshot(); + if (txnId != kNoTxn && txnId <= committedTxn) { + return -EINVAL; + } + if (! FindCommittedDir(parentFid, committedTxn)) { + return -ENOENT; + } + DirNode* const dir = mDirs.Find(parentFid); + if (! dir) { + return -ENOENT; + } + const VersionedDirEntry* const committedEntry = + dir->LookupCommitted(name, committedTxn); + /* Unlike Lookup() and RenameSelf() there is no retry here: a name that now maps to another child is reported as -ENOENT. */ if (! committedEntry || committedEntry->childFid != childFid) { + return -ENOENT; + } + const InodeRecord* const inode = + mInodes.FindCommitted(childFid, committedTxn); + if (! inode) { + return -ENOENT; + } + if (inode->type != type) { + return type == kInodeTypeDir ? -ENOTDIR : -EISDIR; + } + if (requireEmptyFlag) { + const DirNode* const childDir = mDirs.Find(inode->fid); + if (! childDir) { + return -ENOENT; + } + if (childDir->GetChildCount() != 0) { + return -ENOTEMPTY; + } + } + const TxnId opTxnId = txnId == kNoTxn ?
AllocateTxnId() : txnId; + fid_t deletedFid = -1; + const int status = dir->DeletePending(name, opTxnId, &deletedFid); + if (status != 0) { + return status; + } + /* NOTE(review): the pending delete in the directory is not undone when this check fails. */ if (deletedFid != childFid || ! mInodes.MarkDeleted(childFid, opTxnId)) { + return -EIO; + } + AdvanceSeeds(-1, opTxnId); + if (txnIdPtr) { + *txnIdPtr = opTxnId; + } + return 0; +} + /* Applies a create with caller chosen ids. Holds the dir and inode shard mutexes of the parent, the inode shard of the child and, for a directory, the dir shard of the child while CreateSelf() runs. With commitFlag set the current committed txn is used as the visibility horizon and CommitThrough(txnId) is called afterwards; without it the horizon is the txn just before txnId. advanceSeedsFlag raises the fid and txn seeds past the given ids. Returns -EINVAL for negative fids or txnId equal to kNoTxn, otherwise the CreateSelf() status. NOTE(review): the seeds are advanced and the commit is performed even when CreateSelf() failed - confirm this is intended. */ + int +NamespaceStore::ApplyCreate( + fid_t parentFid, + const std::string& name, + InodeType type, + fid_t childFid, + TxnId txnId, + kfsUid_t user, + kfsGid_t group, + kfsMode_t mode, + int16_t numReplicas, + int64_t mtime, + bool commitFlag, + bool advanceSeedsFlag) +{ + if (parentFid < 0 || childFid < 0 || txnId == kNoTxn) { + return -EINVAL; + } + int status = 0; + { + ScopedSmallMutexGroup locker( + &GetDirShardMutex(parentFid), + &GetInodeShardMutex(parentFid), + &GetInodeShardMutex(childFid), + type == kInodeTypeDir ? &GetDirShardMutex(childFid) : 0); + const TxnId committedTxn = commitFlag ? + GetCommittedTxnSnapshot() : txnId - 1; + status = CreateSelf(parentFid, NameKey(name), type, childFid, txnId, 0, + user, group, mode, numReplicas, mtime, committedTxn); + } + if (advanceSeedsFlag) { + AdvanceSeeds(childFid, txnId); + } + if (commitFlag) { + CommitThrough(txnId); + } + return status; +} + /* Same as ApplyCreate() except that the insert is done by CreateSelfTrusted(), which locates the parent with a plain mDirs.Find(). */ + int +NamespaceStore::ApplyCreateTrusted( + fid_t parentFid, + const std::string& name, + InodeType type, + fid_t childFid, + TxnId txnId, + kfsUid_t user, + kfsGid_t group, + kfsMode_t mode, + int16_t numReplicas, + int64_t mtime, + bool commitFlag, + bool advanceSeedsFlag) +{ + if (parentFid < 0 || childFid < 0 || txnId == kNoTxn) { + return -EINVAL; + } + int status = 0; + { + ScopedSmallMutexGroup locker( + &GetDirShardMutex(parentFid), + &GetInodeShardMutex(parentFid), + &GetInodeShardMutex(childFid), + type == kInodeTypeDir ? &GetDirShardMutex(childFid) : 0); + const TxnId committedTxn = commitFlag ?
+ GetCommittedTxnSnapshot() : txnId - 1; + status = CreateSelfTrusted(parentFid, NameKey(name), type, + childFid, txnId, 0, user, group, mode, numReplicas, mtime, + committedTxn); + } + if (advanceSeedsFlag) { + AdvanceSeeds(childFid, txnId); + } + if (commitFlag) { + CommitThrough(txnId); + } + return status; +} + + /* Replays one edit log record. The record must pass ValidateEditLogRecord() and carry a txn id above the committed txn, otherwise -EINVAL. Create records go through ApplyCreate() with seed advancing enabled; remove and rename records go through RemoveSelf() and RenameSelf() with the recorded txn id and are committed on success. A rename is applied only when name currently resolves to the recorded fid. NOTE(review): commitFlag is honoured only for create records; remove and rename always call CommitThrough() on success - confirm this is intended. */ + int +NamespaceStore::ApplyEditLog( + const EditLogRecord& record, + bool commitFlag) +{ + const int validStatus = ValidateEditLogRecord(record); + if (validStatus != 0) { + return validStatus; + } + if (record.txnId <= GetCommittedTxnSnapshot()) { + return -EINVAL; + } + int status = 0; + if (record.type == EditLogRecord::kCreate) { + return ApplyCreate(record.parentFid, record.name, record.inodeType, + record.fid, record.txnId, record.user, record.group, + record.mode, record.numReplicas, record.mtime, commitFlag, true); + } + if (record.type == EditLogRecord::kRemove) { + TxnId txnId = kNoTxn; + /* Directories are removed with requireEmptyFlag set, files without. */ status = RemoveSelf(record.parentFid, record.name, record.inodeType, + record.inodeType == kInodeTypeDir, record.txnId, &txnId); + if (status == 0) { + if (txnId != record.txnId) { + return -EINVAL; + } + CommitThrough(record.txnId); + } + return status; + } + if (record.type == EditLogRecord::kRename) { + LookupResult lookup; + status = Lookup(record.parentFid, record.name, lookup); + if (status != 0) { + return status; + } + if (lookup.fid != record.fid) { + return -EINVAL; + } + TxnId txnId = kNoTxn; + fid_t srcFid = -1; + status = RenameSelf(record.parentFid, record.name, record.newPath, + record.overwriteFlag, record.txnId, &txnId, &srcFid); + if (status == 0) { + /* RenameSelf() reports kNoTxn when it returned early for a rename onto itself; such a record ends up here as -EINVAL. */ if (txnId != record.txnId || srcFid != record.fid) { + return -EINVAL; + } + CommitThrough(record.txnId); + } + return status; + } + return -EINVAL; +} + /* Replays a whole edit log stream line by line; empty lines are skipped. Stops at the first line that fails to parse or apply and returns that status. */ + int +NamespaceStore::ApplyEditLog( + std::istream& is) +{ + std::string line; + while (std::getline(is, line)) { + if (line.empty()) { + continue; + } + EditLogRecord record; + int status = ReadEditLog(line, record); + if (status != 0) { +
return status; + } + /* Single argument call: commitFlag takes whatever default the declaration gives it. */ status = ApplyEditLog(record); + if (status != 0) { + return status; + } + } + /* Reaching end of input is success; only a stream in the bad state is reported. */ return is.bad() ? -EIO : 0; +} + /* Writes a text checkpoint of the committed namespace to os while holding every dir and inode shard mutex. Layout: a header line with the namespacev2_checkpoint magic and version 1, a state line (root fid, next fid, last txn, committed txn, large directory threshold), one inode line per committed inode, one dirgen line per directory whose inode is committed, one dentry line per committed entry whose parent and child inodes are both committed, and a final end line. Returns -EIO when os is not good before or after writing. */ + int +NamespaceStore::SaveCheckpoint( + std::ostream& os) const +{ + if (! os.good()) { + return -EIO; + } + std::vector locks; + AddAllDirShardMutexes(locks); + AddAllInodeShardMutexes(locks); + ScopedMutexGroup locker(locks); + fid_t nextFid = -1; + TxnId nextTxn = kNoTxn; + TxnId committedTxn = kNoTxn; + { + /* Snapshot the counters together under the txn mutex. */ ScopedMutex txnLocker(GetTxnMutex()); + nextFid = mNextFid; + nextTxn = mNextTxn; + committedTxn = mCommittedTxn.load(std::memory_order_relaxed); + } + os << "namespacev2_checkpoint 1\n" << + "state " << mRootFid << " " << nextFid << " " << + nextTxn << " " << committedTxn << " " << + mConfig.dirLargeThreshold << "\n"; + + std::vector inodes; + mInodes.GetCommitted(committedTxn, inodes); + for (std::vector::const_iterator it = inodes.begin(); + it != inodes.end(); + ++it) { + const InodeRecord& inode = *it; + os << "inode " << inode.fid << " " << inode.parentFid << " " << + InodeTypeToInt(inode.type) << " " << inode.generation << " " << + inode.user << " " << inode.group << " " << inode.mode << " " << + inode.numReplicas << " " << inode.mtime << " " << + inode.ctime << " " << inode.atime << "\n"; + } + + std::vector > dirGenerations; + mDirs.GetDirGenerations(dirGenerations); + for (std::vector >::const_iterator it = + dirGenerations.begin(); + it != dirGenerations.end(); + ++it) { + /* Directories whose inode is not committed are left out. */ if (mInodes.FindCommitted(it->first, committedTxn)) { + os << "dirgen " << it->first << " " << it->second << "\n"; + } + } + + std::vector entries; + mDirs.GetCommittedEntries(committedTxn, entries); + for (std::vector::const_iterator it = entries.begin(); + it != entries.end(); + ++it) { + if (! mInodes.FindCommitted(it->parentFid, committedTxn) || + !
mInodes.FindCommitted(it->childFid, committedTxn)) { + continue; + } + os << "dentry " << it->parentFid << " " << it->childFid << " " << + EncodeName(it->key.name) << "\n"; + } + /* The end line closes the checkpoint. */ os << "end\n"; + return os.good() ? 0 : -EIO; +} + /* Writes the same data as SaveCheckpoint() in a slash separated form: every line starts with the nv2 prefix followed by the record kind (state, inode, dirgen, dentry, end) and its fields, and there is no separate magic and version header line. Locking, filtering and return values match SaveCheckpoint(). */ + int +NamespaceStore::SaveCheckpointDiskEntry( + std::ostream& os) const +{ + if (! os.good()) { + return -EIO; + } + std::vector locks; + AddAllDirShardMutexes(locks); + AddAllInodeShardMutexes(locks); + ScopedMutexGroup locker(locks); + fid_t nextFid = -1; + TxnId nextTxn = kNoTxn; + TxnId committedTxn = kNoTxn; + { + /* Snapshot the counters together under the txn mutex. */ ScopedMutex txnLocker(GetTxnMutex()); + nextFid = mNextFid; + nextTxn = mNextTxn; + committedTxn = mCommittedTxn.load(std::memory_order_relaxed); + } + os << "nv2/state/" << mRootFid << "/" << nextFid << "/" << + nextTxn << "/" << committedTxn << "/" << + mConfig.dirLargeThreshold << "\n"; + + std::vector inodes; + mInodes.GetCommitted(committedTxn, inodes); + for (std::vector::const_iterator it = inodes.begin(); + it != inodes.end(); + ++it) { + const InodeRecord& inode = *it; + os << "nv2/inode/" << inode.fid << "/" << inode.parentFid << + "/" << InodeTypeToInt(inode.type) << "/" << + inode.generation << "/" << inode.user << "/" << + inode.group << "/" << inode.mode << "/" << + inode.numReplicas << "/" << inode.mtime << "/" << + inode.ctime << "/" << inode.atime << "\n"; + } + + std::vector > dirGenerations; + mDirs.GetDirGenerations(dirGenerations); + for (std::vector >::const_iterator it = + dirGenerations.begin(); + it != dirGenerations.end(); + ++it) { + /* Directories whose inode is not committed are left out. */ if (mInodes.FindCommitted(it->first, committedTxn)) { + os << "nv2/dirgen/" << it->first << "/" << + it->second << "\n"; + } + } + + std::vector entries; + mDirs.GetCommittedEntries(committedTxn, entries); + for (std::vector::const_iterator it = entries.begin(); + it != entries.end(); + ++it) { + if (! mInodes.FindCommitted(it->parentFid, committedTxn) || + !
mInodes.FindCommitted(it->childFid, committedTxn)) { + continue; + } + os << "nv2/dentry/" << it->parentFid << "/" << + it->childFid << "/" << EncodeName(it->key.name) << "\n"; + } + os << "nv2/end\n"; + return os.good() ? 0 : -EIO; +} + + int +NamespaceStore::LoadCheckpoint( + std::istream& is) +{ + std::string magic; + int version = 0; + if (! (is >> magic >> version) || magic != "namespacev2_checkpoint" || + version != 1) { + return -EINVAL; + } + std::string stateTag; + fid_t rootFid = -1; + fid_t nextFid = -1; + TxnId nextTxn = 0; + TxnId committedTxn = 0; + int largeThreshold = 0; + if (! (is >> stateTag >> rootFid >> nextFid >> nextTxn >> + committedTxn >> largeThreshold) || stateTag != "state" || + rootFid < 0 || nextFid <= rootFid || largeThreshold <= 0 || + committedTxn > nextTxn) { + return -EINVAL; + } + + NamespaceStore tmp(mConfig, rootFid); + tmp.mConfig.dirLargeThreshold = largeThreshold; + tmp.mRootFid = rootFid; + tmp.mNextFid = nextFid; + tmp.mNextTxn = nextTxn; + tmp.mCommittedTxn.store(committedTxn, std::memory_order_relaxed); + tmp.mInodes = InodeTable(); + tmp.mDirs = DirTable(); + + std::vector dentries; + std::vector > dirGenerations; + std::string tag; + bool endFlag = false; + while (is >> tag) { + if (tag == "end") { + endFlag = true; + break; + } + if (tag == "inode") { + fid_t fid = -1; + fid_t parentFid = -1; + int typeValue = -1; + uint64_t generation = 0; + kfsUid_t user = kKfsUserRoot; + kfsGid_t group = kKfsGroupRoot; + kfsMode_t mode = 0; + int16_t numReplicas = 1; + int64_t mtime = 0; + int64_t ctime = 0; + int64_t atime = 0; + InodeType type = kInodeTypeFile; + if (! (is >> fid >> parentFid >> typeValue >> generation >> + user >> group >> mode >> numReplicas >> mtime >> + ctime >> atime) || fid < 0 || parentFid < 0 || + ! 
IntToInodeType(typeValue, type)) { + return -EINVAL; + } + InodeRecord inode(fid, parentFid, type, kNoTxn, + user, group, mode, numReplicas, mtime); + inode.generation = generation; + inode.pendingFlag = false; + inode.ctime = ctime; + inode.atime = atime; + if (! tmp.mInodes.Insert(inode)) { + return -EINVAL; + } + if (type == kInodeTypeDir && ! tmp.mDirs.Insert( + fid, DirNode(tmp.mConfig.dirLargeThreshold, + tmp.mConfig.dirPromoteMaxWallMs))) { + return -EINVAL; + } + } else if (tag == "dirgen") { + fid_t dirFid = -1; + uint64_t generation = 0; + if (! (is >> dirFid >> generation) || dirFid < 0) { + return -EINVAL; + } + dirGenerations.push_back(std::make_pair(dirFid, generation)); + } else if (tag == "dentry") { + fid_t parentFid = -1; + fid_t childFid = -1; + std::string encodedName; + std::string name; + if (! (is >> parentFid >> childFid >> encodedName) || + ! DecodeName(encodedName, name)) { + return -EINVAL; + } + dentries.push_back(CheckpointDirEntry( + parentFid, NameKey(name), childFid)); + } else { + return -EINVAL; + } + } + if (! endFlag) { + return -EINVAL; + } + const InodeRecord* const root = + tmp.mInodes.FindCommitted(rootFid, committedTxn); + if (! root || root->type != kInodeTypeDir || ! tmp.mDirs.Find(rootFid)) { + return -EINVAL; + } + for (std::vector::const_iterator it = dentries.begin(); + it != dentries.end(); + ++it) { + const InodeRecord* const parent = + tmp.mInodes.FindCommitted(it->parentFid, committedTxn); + const InodeRecord* const child = + tmp.mInodes.FindCommitted(it->childFid, committedTxn); + DirNode* const dir = tmp.mDirs.Find(it->parentFid); + if (! parent || parent->type != kInodeTypeDir || ! child || ! dir) { + return -EINVAL; + } + const int status = dir->InsertCommitted(it->key.name, it->childFid); + if (status != 0) { + return status; + } + } + for (std::vector >::const_iterator it = + dirGenerations.begin(); + it != dirGenerations.end(); + ++it) { + DirNode* const dir = tmp.mDirs.Find(it->first); + if (! 
dir) { + return -EINVAL; + } + dir->SetGeneration(it->second); + } + { + std::vector locks; + AddAllDirShardMutexes(locks); + AddAllInodeShardMutexes(locks); + ScopedMutexGroup locker(locks); + ScopedMutex txnLocker(GetTxnMutex()); + mConfig = tmp.mConfig; + mRootFid = tmp.mRootFid; + mNextFid = tmp.mNextFid; + mNextTxn = tmp.mNextTxn; + mCommittedTxn.store( + tmp.mCommittedTxn.load(std::memory_order_relaxed), + std::memory_order_release); + mPendingCommittedTxns = tmp.mPendingCommittedTxns; + mInodes = tmp.mInodes; + mDirs = tmp.mDirs; + } + return 0; +} + + void +NamespaceStore::CommitThrough( + TxnId committedTxn) +{ + ScopedMutex locker(GetTxnMutex()); + TxnId current = mCommittedTxn.load(std::memory_order_relaxed); + if (committedTxn <= current) { + return; + } + mPendingCommittedTxns.insert(committedTxn); + for (;;) { + std::set::iterator const it = + mPendingCommittedTxns.find(current + 1); + if (it == mPendingCommittedTxns.end()) { + break; + } + mPendingCommittedTxns.erase(it); + ++current; + } + mCommittedTxn.store(current, std::memory_order_release); +} + + void +NamespaceStore::CommitThroughRange( + TxnId firstTxn, + TxnId lastTxn) +{ + if (lastTxn < firstTxn) { + return; + } + ScopedMutex locker(GetTxnMutex()); + TxnId current = mCommittedTxn.load(std::memory_order_relaxed); + if (lastTxn <= current) { + return; + } + if (firstTxn <= current + 1) { + current = lastTxn; + } else { + for (TxnId txnId = firstTxn; txnId <= lastTxn; ++txnId) { + mPendingCommittedTxns.insert(txnId); + if (txnId == lastTxn) { + break; + } + } + } + for (;;) { + std::set::iterator const it = + mPendingCommittedTxns.find(current + 1); + if (it == mPendingCommittedTxns.end()) { + break; + } + mPendingCommittedTxns.erase(it); + ++current; + } + mCommittedTxn.store(current, std::memory_order_release); +} + + const InodeRecord* +NamespaceStore::FindCommittedDir( + fid_t dirFid, + TxnId committedTxn) const +{ + const InodeRecord* const inode = + mInodes.FindCommitted(dirFid, 
committedTxn); + return inode && inode->type == kInodeTypeDir ? inode : 0; +} + + int +NamespaceStore::ResolveCreateParentDir( + fid_t parentFid, + TxnId committedTxn, + DirNode*& dirPtr) +{ + dirPtr = 0; + if (! FindCommittedDir(parentFid, committedTxn)) { + return -ENOENT; + } + dirPtr = mDirs.Find(parentFid); + if (! dirPtr) { + return -ENOENT; + } + return 0; +} + + int +NamespaceStore::CheckCreateParentName( + fid_t parentFid, + const NameKey& key, + TxnId committedTxn) +{ + DirNode* dir = 0; + const int status = ResolveCreateParentDir(parentFid, committedTxn, dir); + if (status != 0) { + return status; + } + if (dir->HasVisibleOrPendingName(key, committedTxn)) { + return -EEXIST; + } + return 0; +} + + int +NamespaceStore::FillLookupResult( + const InodeRecord& inode, + uint64_t parentGeneration, + LookupResult& result) const +{ + result.fid = inode.fid; + result.type = inode.type; + result.parentGeneration = parentGeneration; + result.user = inode.user; + result.group = inode.group; + result.mode = inode.mode; + result.numReplicas = inode.numReplicas; + result.mtime = inode.mtime; + result.ctime = inode.ctime; + result.atime = inode.atime; + if (inode.type == kInodeTypeDir) { + const DirNode* const dir = mDirs.Find(inode.fid); + result.fileCount = dir ? 
(int64_t)dir->GetChildCount() : 0; + result.dirCount = 0; + } + return 0; +} + +ResourceLockKey::ResourceLockKey( + Class inResourceClass, + uint64_t inMajor, + uint64_t inMinor) + : resourceClass(inResourceClass), + major(inMajor), + minor(inMinor) + {} + + bool +ResourceLockKey::operator<( + const ResourceLockKey& other) const +{ + if (resourceClass != other.resourceClass) { + return resourceClass < other.resourceClass; + } + if (major != other.major) { + return major < other.major; + } + return minor < other.minor; +} + +} // namespace NamespaceV2 +} // namespace KFS diff --git a/src/cc/meta/NamespaceV2.h b/src/cc/meta/NamespaceV2.h new file mode 100644 index 000000000..5e79c1029 --- /dev/null +++ b/src/cc/meta/NamespaceV2.h @@ -0,0 +1,457 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Memory-native namespace scaffolding for RFC-0001. +// +// Copyright 2026 Quantcast Corporation. All rights reserved. +// +// This file is part of Kosmos File System (KFS). +// +// Licensed under the Apache License, Version 2.0. 
+// +//---------------------------------------------------------------------------- + +#ifndef META_NAMESPACE_V2_H +#define META_NAMESPACE_V2_H + +#include "common/kfsdecls.h" + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +class QCMutex; + +namespace KFS +{ + +class Properties; + +namespace NamespaceV2 +{ + +typedef uint64_t TxnId; + +enum InodeType +{ + kInodeTypeFile, + kInodeTypeDir, + kInodeTypeSymlink +}; + +struct Config +{ + bool enabledFlag; + bool rpcEnabledFlag; + int dirLargeThreshold; + int dirPromoteMaxWallMs; + int dirShardCount; + + Config(); + static Config FromProperties(const Properties& props); +}; + +void SetParameters(const Properties& props); +const Config& GetConfig(); + +struct NameKey +{ + uint64_t hash; + std::string name; + + NameKey(); + explicit NameKey(const std::string& name); + NameKey(uint64_t hash, const std::string& name); + + bool operator<(const NameKey& other) const; + bool operator==(const NameKey& other) const; +}; + +struct NameKeyHash +{ + size_t operator()(const NameKey& key) const; +}; + +struct VersionedDirEntry +{ + fid_t childFid; + TxnId createTxn; + TxnId deleteTxn; + bool pendingFlag; + + VersionedDirEntry(); + VersionedDirEntry(fid_t childFid, TxnId createTxn); + + bool IsVisible(TxnId committedTxn) const; +}; + +struct InodeRecord +{ + fid_t fid; + fid_t parentFid; + InodeType type; + TxnId createTxn; + TxnId deleteTxn; + bool pendingFlag; + uint64_t generation; + kfsUid_t user; + kfsGid_t group; + kfsMode_t mode; + int16_t numReplicas; + int64_t mtime; + int64_t ctime; + int64_t atime; + + InodeRecord(); + InodeRecord(fid_t fid, fid_t parentFid, InodeType type, TxnId createTxn, + kfsUid_t user = kKfsUserRoot, kfsGid_t group = kKfsGroupRoot, + kfsMode_t mode = 0, int16_t numReplicas = 1, int64_t mtime = 0); + + bool IsVisible(TxnId committedTxn) const; +}; + +class InodeTable +{ +public: + InodeTable(); + bool Insert(const InodeRecord& record); + 
InodeRecord* Find(fid_t fid); + const InodeRecord* Find(fid_t fid) const; + const InodeRecord* FindCommitted(fid_t fid, TxnId committedTxn) const; + bool MarkDeleted(fid_t fid, TxnId deleteTxn); + bool Move(fid_t fid, fid_t parentFid); + void GetCommitted(TxnId committedTxn, + std::vector& records) const; + void CommitThrough(TxnId committedTxn); + size_t Size() const; +private: + typedef std::unordered_map Table; + typedef std::vector Tables; + Tables mTables; +}; + +enum DirState +{ + kDirStateSmall, + kDirStatePromoting, + kDirStateLarge +}; + +struct ReaddirCookie +{ + uint64_t generation; + DirState layout; + bool hasLastKeyFlag; + NameKey lastKey; + + ReaddirCookie(); +}; + +struct ReaddirResult +{ + struct Entry + { + NameKey key; + fid_t childFid; + + Entry(const NameKey& key, fid_t childFid); + }; + + std::vector entries; + bool moreEntriesFlag; + ReaddirCookie nextCookie; + + ReaddirResult(); +}; + +struct CheckpointDirEntry +{ + fid_t parentFid; + NameKey key; + fid_t childFid; + + CheckpointDirEntry(fid_t parentFid, const NameKey& key, + fid_t childFid); +}; + +class DirNode +{ +public: + explicit DirNode( + int largeThreshold = Config().dirLargeThreshold, + int promoteMaxWallMs = Config().dirPromoteMaxWallMs); + + DirState GetState() const; + uint64_t GetGeneration() const; + void SetGeneration(uint64_t generation); + size_t GetChildCount() const; + bool IsLarge() const; + + bool HasVisibleOrPendingName(const NameKey& key, + TxnId committedTxn) const; + bool HasVisibleOrPendingName(const std::string& name, + TxnId committedTxn) const; + int InsertPending(const NameKey& key, fid_t childFid, TxnId txnId, + TxnId committedTxn = 0, bool replaceDeletedFlag = false); + int InsertPending(const std::string& name, fid_t childFid, TxnId txnId, + TxnId committedTxn = 0, bool replaceDeletedFlag = false); + int InsertCommitted(const std::string& name, fid_t childFid); + void GetCommittedEntries(TxnId committedTxn, + std::vector& entries) const; + int 
DeletePending(const std::string& name, TxnId txnId, + fid_t* childFidPtr = 0); + const VersionedDirEntry* LookupCommitted( + const std::string& name, TxnId committedTxn) const; + int ReaddirCommitted(TxnId committedTxn, const ReaddirCookie* cookiePtr, + size_t maxEntries, ReaddirResult& result) const; + void CommitThrough(TxnId committedTxn); + +private: + typedef std::unordered_map + SmallEntries; + typedef std::unordered_map + LargeEntries; + + DirState GetCookieLayout() const; + int Promote(); + VersionedDirEntry* FindMutable(const NameKey& key); + const VersionedDirEntry* Find(const NameKey& key) const; + void IncrementSmallGeneration(); + + DirState mState; + uint64_t mGeneration; + int mLargeThreshold; + int mPromoteMaxWallMs; + size_t mChildCount; + SmallEntries mSmall; + LargeEntries mLarge; +}; + +class DirTable +{ +public: + DirTable(); + bool Insert(fid_t dirFid, const DirNode& dir); + DirNode* Find(fid_t dirFid); + const DirNode* Find(fid_t dirFid) const; + void CommitThrough(TxnId committedTxn); + size_t Size() const; + void GetCommittedEntries(TxnId committedTxn, + std::vector& entries) const; + void GetDirGenerations( + std::vector >& generations) const; + +private: + typedef std::unordered_map Table; + typedef std::vector
Tables; + Tables mTables; +}; + +struct LookupResult +{ + fid_t fid; + InodeType type; + uint64_t parentGeneration; + kfsUid_t user; + kfsGid_t group; + kfsMode_t mode; + int16_t numReplicas; + int64_t mtime; + int64_t ctime; + int64_t atime; + int64_t fileCount; + int64_t dirCount; + + LookupResult(); +}; + +struct CreateResult +{ + fid_t fid; + TxnId txnId; + uint64_t parentGeneration; + + CreateResult(); +}; + +struct EditLogRecord +{ + enum Type + { + kInvalid, + kCreate, + kRemove, + kRename + }; + + Type type; + TxnId txnId; + fid_t parentFid; + std::string name; + fid_t fid; + InodeType inodeType; + kfsUid_t user; + kfsGid_t group; + kfsMode_t mode; + int16_t numReplicas; + int64_t mtime; + std::string newPath; + bool overwriteFlag; + + EditLogRecord(); +}; + +int WriteEditLog(std::ostream& os, const EditLogRecord& record); +int ReadEditLog(const std::string& line, EditLogRecord& record); + +class NamespaceStore +{ +public: + explicit NamespaceStore( + const Config& config = GetConfig(), fid_t rootFid = ROOTFID); + + fid_t GetRootFid() const; + TxnId GetCommittedTxn() const; + TxnId GetLastTxn() const; + size_t GetInodeCount() const; + size_t GetDirCount() const; + void ReserveCreateIds(fid_t& childFid, TxnId& txnId); + void ReserveCreateIdsRange( + size_t count, fid_t& firstFid, TxnId& firstTxn); + + int Create(fid_t parentFid, const std::string& name, InodeType type, + CreateResult* resultPtr = 0, kfsUid_t user = kKfsUserRoot, + kfsGid_t group = kKfsGroupRoot, kfsMode_t mode = 0, + int16_t numReplicas = 1, int64_t mtime = 0); + // Test/bench helper: ReserveCreateIds + ApplyCreate (pending until CommitThrough). 
+ int ApplyCreatePending(fid_t parentFid, const std::string& name, + InodeType type, CreateResult* resultPtr = 0, + kfsUid_t user = kKfsUserRoot, kfsGid_t group = kKfsGroupRoot, + kfsMode_t mode = 0, int16_t numReplicas = 1, int64_t mtime = 0); + int Lookup(fid_t parentFid, const std::string& name, + LookupResult& result) const; + int LookupPath(fid_t rootFid, const std::string& path, + LookupResult& result) const; + int GetAttr(fid_t fid, LookupResult& result) const; + int Readdir(fid_t dirFid, const ReaddirCookie* cookiePtr, + size_t maxEntries, ReaddirResult& result) const; + int ReaddirFromName(fid_t dirFid, const std::string& name, + size_t maxEntries, ReaddirResult& result) const; + int Remove(fid_t parentFid, const std::string& name, + TxnId* txnIdPtr = 0); + int RemoveFile(fid_t parentFid, const std::string& name, + TxnId* txnIdPtr = 0); + int Rmdir(fid_t parentFid, const std::string& name, + TxnId* txnIdPtr = 0); + int Rename(fid_t parentFid, const std::string& oldName, + const std::string& newPath, bool overwriteFlag, + TxnId* txnIdPtr = 0, fid_t* srcFidPtr = 0); + int ApplyEditLog(const EditLogRecord& record, bool commitFlag = true); + int ApplyEditLog(std::istream& is); + int ApplyCreate(fid_t parentFid, const std::string& name, InodeType type, + fid_t childFid, TxnId txnId, kfsUid_t user = kKfsUserRoot, + kfsGid_t group = kKfsGroupRoot, kfsMode_t mode = 0, + int16_t numReplicas = 1, int64_t mtime = 0, + bool commitFlag = true, bool advanceSeedsFlag = false); + // Trusted apply fast path: assumes WAL / RPC start already validated + // parent/name/type semantics. Replay must not use this. 
+ int ApplyCreateTrusted(fid_t parentFid, const std::string& name, + InodeType type, fid_t childFid, TxnId txnId, + kfsUid_t user = kKfsUserRoot, kfsGid_t group = kKfsGroupRoot, + kfsMode_t mode = 0, int16_t numReplicas = 1, int64_t mtime = 0, + bool commitFlag = true, bool advanceSeedsFlag = false); + int SaveCheckpoint(std::ostream& os) const; + int SaveCheckpointDiskEntry(std::ostream& os) const; + int LoadCheckpoint(std::istream& is); + void CommitThrough(TxnId committedTxn); + void CommitThroughRange(TxnId firstTxn, TxnId lastTxn); + +private: + TxnId GetCommittedTxnSnapshot() const; + TxnId GetLastTxnSnapshot() const; + void AllocateCreateIds(fid_t& childFid, TxnId& txnId); + TxnId AllocateTxnId(); + void AdvanceSeeds(fid_t fid, TxnId txnId); + const InodeRecord* FindCommittedDir( + fid_t dirFid, TxnId committedTxn) const; + int ResolveCreateParentDir( + fid_t parentFid, + TxnId committedTxn, + DirNode*& dirPtr); + int CheckCreateParentName( + fid_t parentFid, + const NameKey& key, + TxnId committedTxn); + int CreateSelf(fid_t parentFid, const NameKey& key, InodeType type, + fid_t childFid, TxnId txnId, CreateResult* resultPtr, + kfsUid_t user, kfsGid_t group, kfsMode_t mode, + int16_t numReplicas, int64_t mtime, + TxnId committedTxn); + int CreateSelfTrusted(fid_t parentFid, const NameKey& key, InodeType type, + fid_t childFid, TxnId txnId, CreateResult* resultPtr, + kfsUid_t user, kfsGid_t group, kfsMode_t mode, + int16_t numReplicas, int64_t mtime, + TxnId committedTxn); + int CreateSelf(fid_t parentFid, const std::string& name, InodeType type, + fid_t childFid, TxnId txnId, CreateResult* resultPtr, + kfsUid_t user, kfsGid_t group, kfsMode_t mode, + int16_t numReplicas, int64_t mtime, + TxnId committedTxn); + int ResolveRenameTarget(fid_t baseDirFid, const std::string& newPath, + fid_t& dstParentFid, std::string& dstName) const; + bool IsDescendant(fid_t ancestorFid, fid_t dirFid, + TxnId committedTxn) const; + int RemoveSelf(fid_t parentFid, const 
std::string& name, + InodeType type, bool requireEmptyFlag, TxnId txnId, + TxnId* txnIdPtr); + int RenameSelf(fid_t parentFid, const std::string& oldName, + const std::string& newPath, bool overwriteFlag, TxnId txnId, + TxnId* txnIdPtr, fid_t* srcFidPtr); + int FillLookupResult(const InodeRecord& inode, + uint64_t parentGeneration, LookupResult& result) const; + + Config mConfig; + fid_t mRootFid; + fid_t mNextFid; + TxnId mNextTxn; + std::atomic mCommittedTxn; + std::set mPendingCommittedTxns; + InodeTable mInodes; + DirTable mDirs; +}; + +NamespaceStore& GetStore(); + +struct ResourceLockKey +{ + enum Class + { + kSnapshot = 1, + kDir = 2, + kInode = 3, + kBlockMap = 4, + kEditLog = 5 + }; + + Class resourceClass; + uint64_t major; + uint64_t minor; + + ResourceLockKey(Class resourceClass, uint64_t major, uint64_t minor = 0); + + bool operator<(const ResourceLockKey& other) const; +}; + +} // namespace NamespaceV2 +} // namespace KFS + +#endif // META_NAMESPACE_V2_H diff --git a/src/cc/meta/NetDispatch.cc b/src/cc/meta/NetDispatch.cc index 9d1ffcf6b..14de05230 100644 --- a/src/cc/meta/NetDispatch.cc +++ b/src/cc/meta/NetDispatch.cc @@ -1177,6 +1177,50 @@ NetDispatch::Dispatch(MetaRequest *r) } } +void +NetDispatch::DispatchBatch(MetaRequest* const* reqs, size_t count) +{ + if (! reqs || count == 0) { + return; + } + ClientManager::ClientThread* thread = 0; + bool batchFlag = true; + for (size_t i = 0; i < count; ++i) { + MetaRequest* const r = reqs[i]; + if (! r || ! r->clnt) { + continue; + } + ClientManager::ClientThread* const reqThread = + static_cast(r->clnt)->GetClientThread(); + if (! thread) { + thread = reqThread; + } else if (thread != reqThread) { + batchFlag = false; + break; + } + } + if (batchFlag && thread) { + for (size_t i = 0; i < count; ++i) { + MetaRequest* const r = reqs[i]; + if (! r) { + continue; + } + sReqStatsGatherer.OpDone(*r); + r->submitCount = 0; + if (! 
r->clnt) { + MetaRequest::Release(r); + } + } + mClientManager.EnqueueBatch(thread, reqs, count); + return; + } + for (size_t i = 0; i < count; ++i) { + if (reqs[i]) { + Dispatch(reqs[i]); + } + } +} + void NetDispatch::SetMaxClientSockets(int count) { mClientManager.SetMaxClientSockets(count); @@ -1627,10 +1671,16 @@ class ClientManager::ClientThread : assert(mReqPendingQueue.IsEmpty()); // Dispatch requests. MetaRequest* op; + bool needLogFlushFlag = false; while ((op = reqPendingQueue.PopFront())) { submit_request(op); + if (op->commitPendingFlag) { + needLogFlushFlag = true; + } + } + if (needLogFlushFlag) { + MetaRequest::GetLogWriter().ScheduleFlush(); } - MetaRequest::GetLogWriter().ScheduleFlush(); gNetDispatch.ForkDone(); mPrimaryFlag = gLayoutManager.IsPrimary() && MetaRequest::GetLogWriter().IsPrimary(mNetManager.NowUsec()); @@ -1711,6 +1761,28 @@ class ClientManager::ClientThread : mNetManager.Wakeup(); } } + void EnqueueBatch( + MetaRequest* const* reqs, + size_t count) + { + if (! reqs || count == 0) { + return; + } + QCStMutexLocker locker(mMutex); + const bool wasEmptyFlag = mReqQueue.IsEmpty(); + for (size_t i = 0; i < count; ++i) { + MetaRequest* const op = reqs[i]; + if (! op || ! op->clnt) { + continue; + } + op->next = 0; + mReqQueue.PushBack(*op); + } + locker.Unlock(); + if (wasEmptyFlag) { + mNetManager.Wakeup(); + } + } void Add(NetConnectionPtr& conn) { if (! conn || ! conn->IsGood() || ! mThread.IsStarted()) { @@ -1966,6 +2038,37 @@ ClientManager::GetMutex() return mImpl.GetMutex(); } +void +ClientManager::EnqueueBatch( + ClientManager::ClientThread* thread, + MetaRequest* const* reqs, + size_t count) +{ + if (! thread) { + for (size_t i = 0; i < count; ++i) { + if (reqs[i]) { + gNetDispatch.Dispatch(reqs[i]); + } + } + return; + } + if (! thread->IsStarted()) { + for (size_t i = 0; i < count; ++i) { + MetaRequest* const op = reqs[i]; + if (! op || ! 
// Returns the numeric value (0..15) of one hexadecimal digit, accepting both
// lower and upper case letters, or -1 when ch is not a hexadecimal digit.
static int
HexNibble(
    char ch)
{
    static const char kDigits[] = "0123456789abcdef";
    // Fold 'A'..'F' onto 'a'..'f'; every other character is left untouched.
    const char folded =
        ('A' <= ch && ch <= 'F') ? char(ch - 'A' + 'a') : ch;
    // Only the 16 digit characters are searched, never the terminating 0.
    for (int value = 0; value < 16; ++value) {
        if (kDigits[value] == folded) {
            return value;
        }
    }
    return -1;
}
// Sequential reader over a decoded namespace v2 WAL batch payload.
//
// The reader keeps a reference to the buffer passed to the constructor and
// does not copy it; the buffer has to outlive the reader. Every read either
// succeeds and advances the position, or fails and leaves both the position
// and the output argument untouched.
class NamespaceV2WalBatchReader
{
public:
    explicit NamespaceV2WalBatchReader(
        const string& buf)
        : mBuf(buf),
          mPos(0)
        {}

    // Reads sizeof(T) bytes as a little-endian integer into out.
    // Returns false when fewer than sizeof(T) bytes are left.
    template<typename T>
    bool ReadLe(T& out)
    {
        static_assert(sizeof(T) <= sizeof(uint64_t),
            "ReadLe supports integers of at most 64 bits");
        if (Remaining() < sizeof(T)) {
            return false;
        }
        uint64_t v = 0;
        for (size_t i = 0; i < sizeof(T); i++) {
            v |= static_cast<uint64_t>(
                static_cast<unsigned char>(mBuf[mPos++])) << (i * 8);
        }
        out = static_cast<T>(v);
        return true;
    }

    // Copies the next len bytes into out.
    // Returns false when fewer than len bytes are left.
    bool ReadBytes(string& out, size_t len)
    {
        // Compare against the remaining byte count instead of computing
        // mPos + len, which wraps around for very large len values.
        if (Remaining() < len) {
            return false;
        }
        out.assign(mBuf.data() + mPos, len);
        mPos += len;
        return true;
    }

    // True once the whole buffer has been consumed.
    bool Done() const { return mPos == mBuf.size(); }

private:
    // Unread byte count; mPos never exceeds mBuf.size().
    size_t Remaining() const { return mBuf.size() - mPos; }

    const string& mBuf;
    size_t        mPos;
};
DecodeBase64Bytes(enc, bytes) : DecodeHexBytes(enc, bytes))) { + return false; + } + NamespaceV2WalBatchReader r(bytes); + NamespaceV2::NamespaceStore& store = NamespaceV2::GetStore(); + NamespaceV2::TxnId firstTxn = 0; + NamespaceV2::TxnId lastTxn = 0; + for (int64_t i = 0; i < count; i++) { + uint8_t opType = 0; + int64_t parentFid = -1; + int64_t fid = -1; + uint64_t txnId = 0; + uint32_t user = 0; + uint32_t group = 0; + uint16_t mode = 0; + int16_t numReplicas = 0; + int64_t mtime = 0; + uint16_t nameLen = 0; + string name; + if (! r.ReadLe(opType) || + ! r.ReadLe(parentFid) || + ! r.ReadLe(fid) || + ! r.ReadLe(txnId) || + ! r.ReadLe(user) || + ! r.ReadLe(group) || + ! r.ReadLe(mode) || + ! r.ReadLe(numReplicas) || + ! r.ReadLe(mtime) || + ! r.ReadLe(nameLen) || + ! r.ReadBytes(name, nameLen)) { + return false; + } + const NamespaceV2::InodeType type = + opType == 2 ? NamespaceV2::kInodeTypeDir : + NamespaceV2::kInodeTypeFile; + const int status = store.ApplyCreate( + (fid_t)parentFid, + name, + type, + (fid_t)fid, + (NamespaceV2::TxnId)txnId, + (kfsUid_t)user, + (kfsGid_t)group, + (kfsMode_t)mode, + numReplicas, + mtime, + false, // commitFlag + true // advanceSeedsFlag + ); + if (status != 0) { + return false; + } + if (firstTxn == 0) { + firstTxn = (NamespaceV2::TxnId)txnId; + } + lastTxn = (NamespaceV2::TxnId)txnId; + } + if (! r.Done()) { + return false; + } + store.CommitThroughRange(firstTxn, lastTxn); + return true; +} + +static bool +replay_nv2batchc(DETokenizer& c) +{ + c.pop_front(); // record type + return true; +} + /*! 
* \brief replay remove * format: remove/dir//name/ @@ -2158,6 +2345,8 @@ get_entry_map() e.add_parser("version", &replay_version); e.add_parser("create", &replay_create); e.add_parser("mkdir", &replay_mkdir); + e.add_parser("nv2batch", &replay_nv2batch); + e.add_parser("nv2batchc", &replay_nv2batchc); e.add_parser("remove", &replay_remove); e.add_parser("rmdir", &replay_rmdir); e.add_parser("rename", &replay_rename); diff --git a/src/cc/meta/Replay.h b/src/cc/meta/Replay.h index de23d1ff1..5eec6ca17 100644 --- a/src/cc/meta/Replay.h +++ b/src/cc/meta/Replay.h @@ -127,6 +127,7 @@ class Replay bool commitAll(); bool submit(MetaRequest& req) { return (enqueueFlag && enqueue(req)); } + bool isSubmitQueueEnabled() const { return enqueueFlag; } vrNodeId_t getPrimaryNodeId() const { return primaryNodeId; } void handle( diff --git a/src/cc/meta/Restorer.cc b/src/cc/meta/Restorer.cc index 7569363ee..1277c55f5 100644 --- a/src/cc/meta/Restorer.cc +++ b/src/cc/meta/Restorer.cc @@ -36,6 +36,7 @@ #include "NetDispatch.h" #include "LogWriter.h" #include "MetaVrSM.h" +#include "NamespaceV2.h" #include "common/MdStream.h" #include "common/MsgLogger.h" @@ -47,16 +48,167 @@ #include #include #include +#include namespace KFS { using std::cerr; using std::string; using std::ifstream; +using std::stringstream; static int16_t sMinReplicasPerFile = 0; static bool sHasVrSequenceFlag = false; static bool sVrSequenceRequiredFlag = false; +static stringstream sNamespaceV2CheckpointImage; +static bool sNamespaceV2CheckpointStartedFlag = false; + +static void +reset_namespace_v2_checkpoint_image() +{ + sNamespaceV2CheckpointImage.str(string()); + sNamespaceV2CheckpointImage.clear(); + sNamespaceV2CheckpointStartedFlag = false; +} + +static bool +pop_namespace_v2_number( + DETokenizer& c, + int64_t& value) +{ + if (c.empty()) { + return false; + } + value = c.toNumber(); + if (! 
c.isLastOk()) { + return false; + } + c.pop_front(); + return true; +} + +static bool +pop_namespace_v2_token( + DETokenizer& c, + string& value) +{ + if (c.empty() || c.front().empty()) { + return false; + } + value = c.front(); + c.pop_front(); + return true; +} + +static bool +restore_namespace_v2(DETokenizer& c) +{ + c.pop_front(); + if (c.empty()) { + return false; + } + const DETokenizer::Token tag = c.front(); + c.pop_front(); + if (tag == "state") { + int64_t rootFid = -1; + int64_t nextFid = -1; + int64_t nextTxn = -1; + int64_t committedTxn = -1; + int64_t largeThreshold = -1; + if (! pop_namespace_v2_number(c, rootFid) || + ! pop_namespace_v2_number(c, nextFid) || + ! pop_namespace_v2_number(c, nextTxn) || + ! pop_namespace_v2_number(c, committedTxn) || + ! pop_namespace_v2_number(c, largeThreshold) || + ! c.empty()) { + return false; + } + reset_namespace_v2_checkpoint_image(); + sNamespaceV2CheckpointStartedFlag = true; + sNamespaceV2CheckpointImage << "namespacev2_checkpoint 1\n" << + "state " << rootFid << " " << nextFid << " " << + nextTxn << " " << committedTxn << " " << + largeThreshold << "\n"; + return true; + } + if (! sNamespaceV2CheckpointStartedFlag) { + return false; + } + if (tag == "inode") { + int64_t fid = -1; + int64_t parentFid = -1; + int64_t type = -1; + int64_t generation = -1; + int64_t user = -1; + int64_t group = -1; + int64_t mode = -1; + int64_t numReplicas = -1; + int64_t mtime = -1; + int64_t ctime = -1; + int64_t atime = -1; + if (! pop_namespace_v2_number(c, fid) || + ! pop_namespace_v2_number(c, parentFid) || + ! pop_namespace_v2_number(c, type) || + ! pop_namespace_v2_number(c, generation) || + ! pop_namespace_v2_number(c, user) || + ! pop_namespace_v2_number(c, group) || + ! pop_namespace_v2_number(c, mode) || + ! pop_namespace_v2_number(c, numReplicas) || + ! pop_namespace_v2_number(c, mtime) || + ! pop_namespace_v2_number(c, ctime) || + ! pop_namespace_v2_number(c, atime) || + ! 
c.empty()) { + return false; + } + sNamespaceV2CheckpointImage << "inode " << fid << " " << + parentFid << " " << type << " " << generation << " " << + user << " " << group << " " << mode << " " << + numReplicas << " " << mtime << " " << ctime << " " << + atime << "\n"; + return true; + } + if (tag == "dirgen") { + int64_t dirFid = -1; + int64_t generation = -1; + if (! pop_namespace_v2_number(c, dirFid) || + ! pop_namespace_v2_number(c, generation) || ! c.empty()) { + return false; + } + sNamespaceV2CheckpointImage << "dirgen " << dirFid << " " << + generation << "\n"; + return true; + } + if (tag == "dentry") { + int64_t parentFid = -1; + int64_t childFid = -1; + string encodedName; + if (! pop_namespace_v2_number(c, parentFid) || + ! pop_namespace_v2_number(c, childFid) || + ! pop_namespace_v2_token(c, encodedName) || ! c.empty()) { + return false; + } + sNamespaceV2CheckpointImage << "dentry " << parentFid << " " << + childFid << " " << encodedName << "\n"; + return true; + } + if (tag != "end" || ! 
c.empty()) { + return false; + } + sNamespaceV2CheckpointImage << "end\n"; + sNamespaceV2CheckpointImage.clear(); + sNamespaceV2CheckpointImage.seekg(0); + const int status = NamespaceV2::GetStore().LoadCheckpoint( + sNamespaceV2CheckpointImage); + if (status != 0) { + KFS_LOG_STREAM_ERROR << + "namespace v2 checkpoint restore failure: " << status << + KFS_LOG_EOM; + return false; + } + reset_namespace_v2_checkpoint_image(); + return true; +} + static bool checkpoint_seq(DETokenizer& c) @@ -933,6 +1085,7 @@ get_entry_map() e.add_parser("worm", &restore_worm_mode); e.add_parser("ckey", &restore_crypto_key); e.add_parser("shortnames", &restore_short_names); + e.add_parser("nv2", &restore_namespace_v2); Replay::AddRestotreEntries(e); initied = true; return e; @@ -970,6 +1123,7 @@ Restorer::rebuild(const string& cpname, int16_t minReplicas) } sMinReplicasPerFile = minReplicas; sVrSequenceRequiredFlag = mVrSequenceRequiredFlag; + reset_namespace_v2_checkpoint_image(); ifstream file; file.open(cpname.c_str(), ifstream::binary | ifstream::in); if (file.fail()) { @@ -1015,6 +1169,13 @@ Restorer::rebuild(const string& cpname, int16_t minReplicas) is_ok = false; } file.close(); + if (is_ok && sNamespaceV2CheckpointStartedFlag) { + KFS_LOG_STREAM_FATAL << + cpname << ": incomplete namespace v2 checkpoint image" << + KFS_LOG_EOM; + is_ok = false; + } + reset_namespace_v2_checkpoint_image(); if (is_ok && lastLineChecksumFlag) { const string md = mds.GetMd(); if (restoreChecksum != md) { diff --git a/src/cc/meta/namespacev2bench_main.cc b/src/cc/meta/namespacev2bench_main.cc new file mode 100644 index 000000000..4570cb5d8 --- /dev/null +++ b/src/cc/meta/namespacev2bench_main.cc @@ -0,0 +1,367 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// $Id$ +// +// Micro benchmark for RFC-0001 NamespaceV2 scaffolding. +// +// Copyright 2026 Quantcast Corporation. All rights reserved. +// +// This file is part of Kosmos File System (KFS). 
+//
+// Licensed under the Apache License, Version 2.0.
+//
+//----------------------------------------------------------------------------
+
+#include "NamespaceV2.h"
+
+// NOTE(review): the header names in the two groups below were missing in this
+// copy of the patch; they are reconstructed from the identifiers this file
+// uses (errno, gettimeofday, uint64_t, std::max, snprintf, strtoull, strcmp,
+// cout, numeric_limits, string, vector) -- confirm against the original.
+#include <errno.h>
+#include <stdint.h>
+#include <sys/time.h>
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <string>
+#include <vector>
+
+using namespace KFS;
+using namespace KFS::NamespaceV2;
+using std::cout;
+using std::string;
+using std::vector;
+
+namespace
+{
+
+// Command line settings of the benchmark; see Usage() for the matching
+// option names.
+struct Options
+{
+    uint64_t entries;          // --entries: number of file creates
+    uint64_t dirs;             // --dirs: parent directories to spread creates
+    uint64_t lookupSamples;    // --lookup-samples: lookups to time
+    size_t   readdirPageSize;  // --readdir-page: readdir page size
+    int      largeThreshold;   // --threshold: copied into
+                               // Config::dirLargeThreshold by main()
+
+    Options()
+        : entries(1000000),
+          dirs(1),
+          lookupSamples(100000),
+          readdirPageSize(1000),
+          largeThreshold(Config().dirLargeThreshold)
+        {}
+};
+
+// Returns the current gettimeofday() time, in microseconds.
+    uint64_t
+NowUsec()
+{
+    timeval tv;
+    gettimeofday(&tv, 0);
+    return uint64_t(tv.tv_sec) * 1000000 + tv.tv_usec;
+}
+
+// Converts "count operations in usec microseconds" into operations per
+// second. Returns 0 when usec is 0 to avoid dividing by zero.
+    double
+OpsPerSec(
+    uint64_t count,
+    uint64_t usec)
+{
+    return usec == 0 ? 0 : (double(count) * 1000000.0) / double(usec);
+}
+
+// Parses valuePtr as an unsigned decimal number into value.
+// namePtr is only used in the error message. Returns false, after printing
+// "invalid <name>: <value>", when the text is empty, negative, out of range,
+// or followed by anything other than the terminating 0; value is left
+// unchanged in that case.
+    bool
+ParseUInt64(
+    const char* namePtr,
+    const char* valuePtr,
+    uint64_t&   value)
+{
+    char* endPtr = 0;
+    errno = 0;
+    const unsigned long long parsed = strtoull(valuePtr, &endPtr, 10);
+    // strtoull() treats neither "no digits" nor a leading '-' as an error:
+    // the former leaves endPtr == valuePtr and returns 0, the latter negates
+    // the result in the unsigned type. Reject both explicitly. Once the whole
+    // string has been consumed a '-' can only be the sign character.
+    if (errno != 0 || ! endPtr || endPtr == valuePtr || *endPtr != 0 ||
+            strchr(valuePtr, '-')) {
+        cout << "invalid " << namePtr << ": " << valuePtr << "\n";
+        return false;
+    }
+    value = uint64_t(parsed);
+    return true;
+}
+
+// Same as ParseUInt64(), for size_t. Values that do not fit into size_t are
+// rejected instead of being silently truncated.
+    bool
+ParseSize(
+    const char* namePtr,
+    const char* valuePtr,
+    size_t&     value)
+{
+    uint64_t parsed = 0;
+    if (! ParseUInt64(namePtr, valuePtr, parsed)) {
+        return false;
+    }
+    const size_t narrowed = size_t(parsed);
+    // Round trip check: differs only when size_t is narrower than 64 bits
+    // and the value does not fit.
+    if (uint64_t(narrowed) != parsed) {
+        cout << "invalid " << namePtr << ": " << valuePtr << "\n";
+        return false;
+    }
+    value = narrowed;
+    return true;
+}
+
+// Parses valuePtr as a strictly positive decimal int into value.
+// Returns false, after printing "invalid <name>: <value>", for malformed
+// text, zero, negative numbers, and numbers larger than INT_MAX.
+    bool
+ParseInt(
+    const char* namePtr,
+    const char* valuePtr,
+    int&        value)
+{
+    char* endPtr = 0;
+    errno = 0;
+    const long parsed = strtol(valuePtr, &endPtr, 10);
+    // An empty string parses as 0 and is rejected by the parsed <= 0 test.
+    // The upper bound check keeps the long to int conversion below exact.
+    if (errno != 0 || ! endPtr || *endPtr != 0 || parsed <= 0 ||
+            parsed > long(std::numeric_limits<int>::max())) {
+        cout << "invalid " << namePtr << ": " << valuePtr << "\n";
+        return false;
+    }
+    value = int(parsed);
+    return true;
+}
+
+// Prints the option summary to standard output.
+// NOTE(review): runs of spaces inside these literals may have been collapsed
+// in this copy of the patch -- confirm against the original commit.
+// NOTE(review): the --threshold default shown here is hard-coded, while
+// Options takes it from Config().dirLargeThreshold -- confirm they agree.
+    void
+Usage(
+    const char* progPtr)
+{
+    cout <<
+        "Usage: " << progPtr << " [options]\n"
+        " --entries N number of file creates, default 1000000\n"
+        " --dirs N parent directories to spread creates, default 1\n"
+        " --lookup-samples N committed lookups to sample, default 100000\n"
+        " --readdir-page N committed readdir page size, default 1000\n"
+        " --threshold N Small to Large promotion threshold, default 4096\n"
+    ;
+}
+
+// Fills options from the command line. Every option other than --help / -h
+// takes exactly one value. Returns false when help was requested, a value is
+// missing or invalid, or the option is unknown; the caller is expected to
+// exit in that case. On success the counts are clamped to at least 1 and
+// lookupSamples to at most entries.
+    bool
+ParseOptions(
+    int      argc,
+    char**   argv,
+    Options& options)
+{
+    for (int i = 1; i < argc; i++) {
+        const char* const argPtr = argv[i];
+        if (strcmp(argPtr, "--help") == 0 || strcmp(argPtr, "-h") == 0) {
+            Usage(argv[0]);
+            return false;
+        }
+        if (i + 1 >= argc) {
+            cout << "missing value for " << argPtr << "\n";
+            Usage(argv[0]);
+            return false;
+        }
+        const char* const valuePtr = argv[++i];
+        if (strcmp(argPtr, "--entries") == 0) {
+            if (! ParseUInt64(argPtr, valuePtr, options.entries)) {
+                return false;
+            }
+        } else if (strcmp(argPtr, "--dirs") == 0) {
+            if (! ParseUInt64(argPtr, valuePtr, options.dirs)) {
+                return false;
+            }
+        } else if (strcmp(argPtr, "--lookup-samples") == 0) {
+            if (! ParseUInt64(argPtr, valuePtr, options.lookupSamples)) {
+                return false;
+            }
+        } else if (strcmp(argPtr, "--readdir-page") == 0) {
+            if (! ParseSize(argPtr, valuePtr, options.readdirPageSize)) {
+                return false;
+            }
+        } else if (strcmp(argPtr, "--threshold") == 0) {
+            if (! ParseInt(argPtr, valuePtr, options.largeThreshold)) {
+                return false;
+            }
+        } else {
+            cout << "unknown option: " << argPtr << "\n";
+            Usage(argv[0]);
+            return false;
+        }
+    }
+    // The explicit template arguments are required: the literal 1 is an int
+    // and would otherwise conflict with the 64 bit / size_t operands.
+    options.entries = std::max<uint64_t>(1, options.entries);
+    options.dirs = std::max<uint64_t>(1, options.dirs);
+    options.lookupSamples = std::min<uint64_t>(options.entries,
+        std::max<uint64_t>(1, options.lookupSamples));
+    options.readdirPageSize = std::max<size_t>(1, options.readdirPageSize);
+    return true;
+}
+
+// Returns prefixPtr followed by index in decimal, e.g. "f_" and 7 -> "f_7".
+    string
+MakeName(
+    const char* prefixPtr,
+    uint64_t    index)
+{
+    char buf[64];
+    snprintf(buf, sizeof(buf), "%s%llu", prefixPtr,
+        (unsigned long long)index);
+    return string(buf);
+}
+
+// Fills parentFids with the directories the benchmark creates files in.
+// With options.dirs == 1 that is just the root; otherwise options.dirs
+// directories named "d_<i>" are created under the root and committed.
+// Returns 0 on success, or the first non zero status from Create().
+    int
+CreateParentDirs(
+    NamespaceStore& store,
+    const Options&  options,
+    vector<fid_t>&  parentFids)
+{
+    parentFids.clear();
+    if (options.dirs == 1) {
+        parentFids.push_back(store.GetRootFid());
+        return 0;
+    }
+    parentFids.reserve(size_t(options.dirs));
+    for (uint64_t i = 0; i < options.dirs; i++) {
+        CreateResult result;
+        const int status = store.Create(
+            store.GetRootFid(), MakeName("d_", i), kInodeTypeDir, &result);
+        if (status != 0) {
+            cout << "mkdir failed index=" << i << " status=" << status << "\n";
+            return status;
+        }
+        parentFids.push_back(result.fid);
+    }
+    store.CommitThrough(store.GetLastTxn());
+    return 0;
+}
+
+// Times options.entries ApplyCreatePending() calls for files "f_<i>", spread
+// round robin over parentFids, then separately times a single
+// CommitThrough() of the last transaction. Prints both timings.
+// Returns 0 on success, or the first non zero create status.
+    int
+BenchmarkCreate(
+    NamespaceStore&      store,
+    const Options&       options,
+    const vector<fid_t>& parentFids)
+{
+    const uint64_t startUsec = NowUsec();
+    for (uint64_t i = 0; i < options.entries; i++) {
+        const fid_t parentFid = parentFids[size_t(i % parentFids.size())];
+        const int status = store.ApplyCreatePending(
+            parentFid, MakeName("f_", i), kInodeTypeFile);
+        if (status != 0) {
+            cout << "create failed index=" << i << " status=" << status <<
+                "\n";
+            return status;
+        }
+    }
+    const uint64_t createUsec = NowUsec() - startUsec;
+    cout << "create count=" << options.entries <<
+        " usec=" << createUsec <<
+        " ops_per_sec=" << OpsPerSec(options.entries, createUsec) << "\n";
+
+    const uint64_t commitStartUsec = NowUsec();
+    store.CommitThrough(store.GetLastTxn());
+    const uint64_t commitUsec = NowUsec() - commitStartUsec;
+    cout << "commit txn=" << store.GetCommittedTxn() <<
+        " usec=" << commitUsec << "\n";
+    return 0;
+}
+
+// Times options.lookupSamples Lookup() calls. The sampled file indexes step
+// through [0, entries) with a fixed stride, and each index uses the same
+// parent and name that BenchmarkCreate() used for it.
+// Returns 0 on success, or the status of the first failed lookup.
+    int
+BenchmarkLookup(
+    const NamespaceStore& store,
+    const Options&        options,
+    const vector<fid_t>&  parentFids)
+{
+    uint64_t found = 0;
+    const uint64_t stride = std::max<uint64_t>(
+        1, options.entries / options.lookupSamples);
+    const uint64_t startUsec = NowUsec();
+    for (uint64_t n = 0, i = 0; n < options.lookupSamples; n++,
+            i = (i + stride) % options.entries) {
+        const fid_t parentFid = parentFids[size_t(i % parentFids.size())];
+        LookupResult result;
+        const int status = store.Lookup(parentFid, MakeName("f_", i), result);
+        if (status == 0) {
+            found++;
+        } else {
+            cout << "lookup failed index=" << i << " status=" << status <<
+                "\n";
+            return status;
+        }
+    }
+    const uint64_t lookupUsec = NowUsec() - startUsec;
+    cout << "lookup count=" << options.lookupSamples <<
+        " found=" << found <<
+        " usec=" << lookupUsec <<
+        " ops_per_sec=" << OpsPerSec(options.lookupSamples, lookupUsec) <<
+        "\n";
+    return 0;
+}
+
+// Times paging through every directory in parentFids with Readdir(), using
+// pages of options.readdirPageSize entries and the cookie returned by the
+// previous page. Returns the first non zero Readdir() status, -EIO when the
+// total number of entries seen differs from options.entries, 0 otherwise.
+    int
+BenchmarkReaddir(
+    const NamespaceStore& store,
+    const Options&        options,
+    const vector<fid_t>&  parentFids)
+{
+    uint64_t entryCount = 0;
+    uint64_t pageCount = 0;
+    const uint64_t startUsec = NowUsec();
+    for (size_t i = 0; i < parentFids.size(); i++) {
+        ReaddirCookie cookie;
+        // Null cookie pointer requests the first page.
+        const ReaddirCookie* cookiePtr = 0;
+        do {
+            ReaddirResult result;
+            const int status = store.Readdir(parentFids[i], cookiePtr,
+                options.readdirPageSize, result);
+            if (status != 0) {
+                cout << "readdir failed parent_index=" << i <<
+                    " status=" << status << "\n";
+                return status;
+            }
+            entryCount += result.entries.size();
+            pageCount++;
+            // Copy the cookie out of result: result goes out of scope before
+            // the next Readdir() call that consumes it.
+            cookie = result.nextCookie;
+            cookiePtr = result.moreEntriesFlag ? &cookie : 0;
+            if (! result.moreEntriesFlag) {
+                break;
+            }
+        } while (true);
+    }
+    const uint64_t readdirUsec = NowUsec() - startUsec;
+    cout << "readdir entries=" << entryCount <<
+        " pages=" << pageCount <<
+        " usec=" << readdirUsec <<
+        " entries_per_sec=" << OpsPerSec(entryCount, readdirUsec) << "\n";
+    return entryCount == options.entries ? 0 : -EIO;
+}
+
+} // namespace
+
+// Runs setup, create, lookup and readdir phases in that order against one
+// in-memory NamespaceStore. Exits with 1 as soon as option parsing or any
+// phase fails, 0 otherwise.
+    int
+main(
+    int    argc,
+    char** argv)
+{
+    Options options;
+    if (! ParseOptions(argc, argv, options)) {
+        return 1;
+    }
+
+    Config cfg;
+    cfg.enabledFlag = true;
+    cfg.dirLargeThreshold = options.largeThreshold;
+    NamespaceStore store(cfg);
+    vector<fid_t> parentFids;
+
+    cout << "namespacev2bench entries=" << options.entries <<
+        " dirs=" << options.dirs <<
+        " threshold=" << options.largeThreshold <<
+        " lookup_samples=" << options.lookupSamples <<
+        " readdir_page=" << options.readdirPageSize << "\n";
+
+    uint64_t startUsec = NowUsec();
+    int status = CreateParentDirs(store, options, parentFids);
+    uint64_t setupUsec = NowUsec() - startUsec;
+    if (status != 0) {
+        return 1;
+    }
+    cout << "setup dirs=" << parentFids.size() <<
+        " usec=" << setupUsec << "\n";
+
+    status = BenchmarkCreate(store, options, parentFids);
+    if (status != 0) {
+        return 1;
+    }
+    status = BenchmarkLookup(store, options, parentFids);
+    if (status != 0) {
+        return 1;
+    }
+    status = BenchmarkReaddir(store, options, parentFids);
+    if (status != 0) {
+        return 1;
+    }
+    return 0;
+}
diff --git a/src/cc/meta/namespacev2test_main.cc b/src/cc/meta/namespacev2test_main.cc
new file mode 100644
index 000000000..071da6aeb
--- /dev/null
+++ b/src/cc/meta/namespacev2test_main.cc
@@ -0,0 +1,886 @@
+//---------------------------------------------------------- -*- Mode: C++ -*-
+// $Id$
+//
+// Unit tests for RFC-0001 NamespaceV2 scaffolding.
+//
+// Copyright 2026 Quantcast Corporation. All rights reserved.
+//
+// This file is part of Kosmos File System (KFS).
+//
+// Licensed under the Apache License, Version 2.0.
+// +//---------------------------------------------------------------------------- + +#include "NamespaceV2.h" + +#include "common/Properties.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace KFS; +using namespace KFS::NamespaceV2; +using std::cout; +using std::string; +using std::vector; + +namespace +{ + +int gErrorCount = 0; + + void +Check( + bool okFlag, + const char* msgPtr) +{ + if (! okFlag) { + cout << "FAILED: " << msgPtr << "\n"; + gErrorCount++; + } +} + + string +MakeName( + const char* prefixPtr, + int first, + int second = -1) +{ + std::ostringstream os; + os << prefixPtr << first; + if (second >= 0) { + os << "_" << second; + } + return os.str(); +} + + void +TestConfig() +{ + Properties props; + props.setValue("metaServer.namespaceV2.enabled", "1"); + props.setValue("metaServer.namespaceV2.rpcEnabled", "1"); + props.setValue("metaServer.dir.largeThreshold", "8"); + props.setValue("metaServer.dir.promoteMaxWallMs", "77"); + props.setValue("metaServer.namespaceV2.dirShardCount", "16"); + SetParameters(props); + const Config& cfg = GetConfig(); + Check(cfg.enabledFlag, "namespace v2 enabled flag"); + Check(cfg.rpcEnabledFlag, "namespace v2 rpc enabled flag"); + Check(cfg.dirLargeThreshold == 8, "large threshold"); + Check(cfg.dirPromoteMaxWallMs == 77, "promotion wall limit"); + Check(cfg.dirShardCount == 16, "dir shard count"); +} + + void +TestNameKey() +{ + const NameKey a("a"); + const NameKey b("b"); + Check(a == NameKey(a.hash, "a"), "name key equality"); + Check((a < b) || (b < a), "name key strict ordering"); +} + + void +TestPendingCommittedVisibility() +{ + DirNode dir(4); + Check(dir.InsertPending("f", 100, 10) == 0, "insert pending"); + Check(dir.LookupCommitted("f", 9) == 0, "pending create invisible"); + dir.CommitThrough(10); + const VersionedDirEntry* entry = dir.LookupCommitted("f", 10); + Check(entry && entry->childFid == 100, "committed create visible"); + fid_t 
deletedFid = -1; + Check(dir.DeletePending("f", 20, &deletedFid) == 0, "delete pending"); + Check(deletedFid == 100, "delete returns child fid"); + Check(dir.LookupCommitted("f", 19) != 0, "pending delete still visible"); + Check(dir.InsertPending("f", 101, 21) == -EEXIST, + "recreate rejected while delete pending"); + dir.CommitThrough(20); + Check(dir.LookupCommitted("f", 20) == 0, "committed delete invisible"); + Check(dir.InsertPending("f", 101, 30) == 0, + "recreate after delete commit"); + Check(dir.LookupCommitted("f", 29) == 0, "recreate pending invisible"); + dir.CommitThrough(30); + entry = dir.LookupCommitted("f", 30); + Check(entry && entry->childFid == 101, "recreate committed visible"); +} + + void +TestSmallCookieInvalidation() +{ + DirNode dir(10); + Check(dir.InsertPending("b", 2, 1) == 0, "insert b"); + Check(dir.InsertPending("a", 1, 2) == 0, "insert a"); + dir.CommitThrough(2); + ReaddirResult res; + Check(dir.ReaddirCommitted(2, 0, 1, res) == 0, "small readdir first page"); + Check(res.entries.size() == 1, "small readdir page size"); + const ReaddirCookie oldCookie = res.nextCookie; + Check(dir.InsertPending("c", 3, 3) == 0, "insert c invalidates small cookie"); + dir.CommitThrough(3); + Check(dir.ReaddirCommitted(3, &oldCookie, 1, res) == -EINVAL, + "old small cookie rejected"); +} + + void +TestPromotion() +{ + DirNode dir(2); + Check(dir.InsertPending("a", 1, 1) == 0, "promotion insert a"); + Check(dir.InsertPending("b", 2, 2) == 0, "promotion insert b"); + Check(! 
dir.IsLarge(), "at threshold still small"); + const uint64_t genBefore = dir.GetGeneration(); + Check(dir.InsertPending("c", 3, 3) == 0, "promotion insert c"); + Check(dir.IsLarge(), "promoted to large"); + Check(dir.GetGeneration() > genBefore, "promotion increments generation"); + dir.CommitThrough(3); + Check(dir.LookupCommitted("a", 3) != 0, "large lookup a"); + Check(dir.LookupCommitted("b", 3) != 0, "large lookup b"); + Check(dir.LookupCommitted("c", 3) != 0, "large lookup c"); +} + + void +TestLargeCookieStableAcrossInsert() +{ + DirNode dir(1); + Check(dir.InsertPending("a", 1, 1) == 0, "large insert a"); + Check(dir.InsertPending("b", 2, 2) == 0, "large insert b"); + Check(dir.InsertPending("d", 4, 3) == 0, "large insert d"); + dir.CommitThrough(3); + Check(dir.IsLarge(), "large state"); + ReaddirResult res; + Check(dir.ReaddirCommitted(3, 0, 1, res) == 0, "large first page"); + Check(res.entries.size() == 1, "large page size"); + const ReaddirCookie cookie = res.nextCookie; + const uint64_t gen = dir.GetGeneration(); + Check(dir.InsertPending("c", 3, 4) == 0, "large insert keeps cookie valid"); + dir.CommitThrough(4); + Check(dir.GetGeneration() == gen, "large create does not bump generation"); + Check(dir.ReaddirCommitted(4, &cookie, 10, res) == 0, + "old large NameKey cookie remains valid"); + Check(! res.entries.empty(), "large resume returns entries"); + for (size_t i = 0; i < res.entries.size(); i++) { + Check(cookie.lastKey < res.entries[i].key, + "large resume returns entries after last key"); + } +} + + void +TestInodeTable() +{ + InodeTable table; + Check(table.Insert(InodeRecord(10, 1, kInodeTypeFile, 5)), + "inode insert"); + Check(! 
table.Insert(InodeRecord(10, 1, kInodeTypeFile, 6)), + "duplicate inode insert rejected"); + Check(table.FindCommitted(10, 4) == 0, "pending inode invisible"); + Check(table.FindCommitted(10, 5) != 0, "committed inode visible"); + Check(table.MarkDeleted(10, 8), "inode mark deleted"); + Check(table.FindCommitted(10, 7) != 0, "pending inode delete visible"); + Check(table.FindCommitted(10, 8) == 0, "committed inode delete invisible"); +} + + void +TestNamespaceStoreBasic() +{ + Config cfg; + cfg.dirLargeThreshold = 2; + NamespaceStore store(cfg); + LookupResult lookup; + Check(store.GetRootFid() == ROOTFID, "namespace root fid"); + Check(store.GetInodeCount() == 1, "namespace root inode"); + Check(store.GetDirCount() == 1, "namespace root dir"); + Check(store.Lookup(ROOTFID, "f", lookup) == -ENOENT, + "namespace missing lookup"); + + CreateResult create; + Check(store.Create(ROOTFID, "f", kInodeTypeFile, &create) == 0, + "namespace create file"); + Check(create.fid > ROOTFID, "namespace create fid assigned"); + Check(store.GetLastTxn() == create.txnId, "namespace txn assigned"); + Check(store.Lookup(ROOTFID, "f", lookup) == -ENOENT, + "namespace pending create invisible"); + Check(store.Create(ROOTFID, "f", kInodeTypeFile, 0) == -EEXIST, + "namespace duplicate pending create rejected"); + Check(store.GetLastTxn() == create.txnId, + "namespace failed create does not consume txn"); + + store.CommitThrough(create.txnId); + Check(store.Lookup(ROOTFID, "f", lookup) == 0 && + lookup.fid == create.fid && lookup.type == kInodeTypeFile, + "namespace committed create visible"); + ReaddirResult readdir; + Check(store.Readdir(ROOTFID, 0, 10, readdir) == 0 && + readdir.entries.size() == 1 && + readdir.entries[0].childFid == create.fid, + "namespace readdir committed create"); + + TxnId deleteTxn = 0; + Check(store.Remove(ROOTFID, "f", &deleteTxn) == 0, + "namespace remove file"); + Check(deleteTxn > create.txnId, "namespace remove txn assigned"); + Check(store.Lookup(ROOTFID, 
"f", lookup) == 0, + "namespace pending remove still visible"); + Check(store.Create(ROOTFID, "f", kInodeTypeFile, 0) == -EEXIST, + "namespace recreate rejected while remove pending"); + Check(store.GetLastTxn() == deleteTxn, + "namespace failed recreate does not consume txn"); + store.CommitThrough(deleteTxn); + Check(store.Lookup(ROOTFID, "f", lookup) == -ENOENT, + "namespace committed remove invisible"); + Check(store.Create(ROOTFID, "f", kInodeTypeFile, 0) == 0, + "namespace recreate after committed remove"); +} + + void +TestNamespaceStoreDirectory() +{ + Config cfg; + cfg.dirLargeThreshold = 1; + NamespaceStore store(cfg); + CreateResult dir; + Check(store.Create(ROOTFID, "d", kInodeTypeDir, &dir) == 0, + "namespace mkdir"); + Check(store.GetDirCount() == 2, "namespace dir table insert pending dir"); + Check(store.Create(dir.fid, "before_commit", kInodeTypeFile, 0) == -ENOENT, + "namespace pending dir not usable"); + Check(store.GetLastTxn() == dir.txnId, + "namespace failed child create does not consume txn"); + store.CommitThrough(dir.txnId); + + CreateResult first; + Check(store.Create(dir.fid, "a", kInodeTypeFile, &first) == 0, + "namespace child create"); + store.CommitThrough(first.txnId); + LookupResult lookup; + Check(store.Lookup(dir.fid, "a", lookup) == 0 && lookup.fid == first.fid, + "namespace child lookup"); + + CreateResult second; + Check(store.Create(dir.fid, "b", kInodeTypeFile, &second) == 0, + "namespace child create promotes dir"); + store.CommitThrough(second.txnId); + ReaddirResult readdir; + Check(store.Readdir(dir.fid, 0, 10, readdir) == 0 && + readdir.entries.size() == 2, + "namespace child readdir after promotion"); +} + + void +TestNamespaceStorePathAndRmdir() +{ + Config cfg; + NamespaceStore store(cfg); + CreateResult dir; + Check(store.Create(ROOTFID, "d", kInodeTypeDir, &dir, + 10, 20, 0755, 0, 100) == 0, + "namespace path mkdir d"); + store.CommitThrough(dir.txnId); + + CreateResult child; + Check(store.Create(dir.fid, "c", 
kInodeTypeDir, &child, + 11, 21, 0750, 0, 101) == 0, + "namespace path mkdir child"); + store.CommitThrough(child.txnId); + + CreateResult file; + Check(store.Create(child.fid, "f", kInodeTypeFile, &file, + 12, 22, 0644, 2, 102) == 0, + "namespace path create file"); + store.CommitThrough(file.txnId); + + LookupResult lookup; + Check(store.LookupPath(ROOTFID, "/d/c/f", lookup) == 0 && + lookup.fid == file.fid && lookup.user == 12 && + lookup.group == 22 && lookup.mode == 0644 && + lookup.numReplicas == 2, + "namespace lookup path attrs"); + Check(store.LookupPath(ROOTFID, "/d/c/f/x", lookup) == -ENOTDIR, + "namespace lookup below file rejected"); + Check(store.RemoveFile(dir.fid, "c", 0) == -EISDIR, + "namespace remove dir as file rejected"); + Check(store.Rmdir(child.fid, "f", 0) == -ENOTDIR, + "namespace rmdir file rejected"); + Check(store.Rmdir(dir.fid, "c", 0) == -ENOTEMPTY, + "namespace rmdir non-empty rejected"); + + TxnId txnId = 0; + Check(store.RemoveFile(child.fid, "f", &txnId) == 0, + "namespace remove child file"); + store.CommitThrough(txnId); + Check(store.Rmdir(dir.fid, "c", &txnId) == 0, + "namespace rmdir empty child"); + store.CommitThrough(txnId); + Check(store.LookupPath(ROOTFID, "/d/c", lookup) == -ENOENT, + "namespace rmdir committed invisible"); +} + + + void +TestNamespaceStoreRename() +{ + Config cfg; + cfg.dirLargeThreshold = 2; + NamespaceStore store(cfg); + + CreateResult d; + Check(store.Create(ROOTFID, "d", kInodeTypeDir, &d) == 0, + "rename mkdir d"); + store.CommitThrough(d.txnId); + CreateResult e; + Check(store.Create(ROOTFID, "e", kInodeTypeDir, &e) == 0, + "rename mkdir e"); + store.CommitThrough(e.txnId); + + CreateResult file; + Check(store.Create(d.fid, "f", kInodeTypeFile, &file) == 0, + "rename create source file"); + store.CommitThrough(file.txnId); + + TxnId txnId = 0; + fid_t srcFid = -1; + Check(store.Rename(d.fid, "f", "g", false, &txnId, &srcFid) == 0 && + srcFid == file.fid, + "rename same dir file"); + 
store.CommitThrough(txnId); + LookupResult lookup; + Check(store.Lookup(d.fid, "f", lookup) == -ENOENT, + "rename source removed"); + Check(store.Lookup(d.fid, "g", lookup) == 0 && lookup.fid == file.fid, + "rename target visible"); + + Check(store.Rename(d.fid, "g", "/e/h", false, &txnId, &srcFid) == 0, + "rename cross dir absolute target"); + store.CommitThrough(txnId); + Check(store.Lookup(d.fid, "g", lookup) == -ENOENT, + "cross dir source removed"); + Check(store.Lookup(e.fid, "h", lookup) == 0 && lookup.fid == file.fid, + "cross dir target visible"); + + CreateResult other; + Check(store.Create(e.fid, "z", kInodeTypeFile, &other) == 0, + "rename create overwrite target"); + store.CommitThrough(other.txnId); + Check(store.Rename(e.fid, "h", "z", false, 0, 0) == -EEXIST, + "rename overwrite disabled"); + Check(store.Rename(e.fid, "h", "z", true, &txnId, &srcFid) == 0, + "rename overwrite file"); + store.CommitThrough(txnId); + Check(store.Lookup(e.fid, "h", lookup) == -ENOENT, + "overwrite source removed"); + Check(store.Lookup(e.fid, "z", lookup) == 0 && lookup.fid == file.fid, + "overwrite target replaced"); + + CreateResult x; + Check(store.Create(ROOTFID, "x", kInodeTypeFile, &x) == 0, + "rename create type mismatch file"); + store.CommitThrough(x.txnId); + CreateResult y; + Check(store.Create(ROOTFID, "y", kInodeTypeDir, &y) == 0, + "rename create type mismatch dir"); + store.CommitThrough(y.txnId); + Check(store.Rename(ROOTFID, "x", "y", true, 0, 0) == -EISDIR, + "rename file over dir rejected"); + + CreateResult a; + Check(store.Create(ROOTFID, "a", kInodeTypeDir, &a) == 0, + "rename create ancestor dir"); + store.CommitThrough(a.txnId); + CreateResult b; + Check(store.Create(a.fid, "b", kInodeTypeDir, &b) == 0, + "rename create descendant dir"); + store.CommitThrough(b.txnId); + Check(store.Rename(ROOTFID, "a", "/a/b/c", false, 0, 0) == -EINVAL, + "rename dir into descendant rejected"); + + CreateResult p; + Check(store.Create(ROOTFID, "p", 
kInodeTypeDir, &p) == 0, + "rename create source dir"); + store.CommitThrough(p.txnId); + CreateResult q; + Check(store.Create(ROOTFID, "q", kInodeTypeDir, &q) == 0, + "rename create non-empty target dir"); + store.CommitThrough(q.txnId); + CreateResult qChild; + Check(store.Create(q.fid, "child", kInodeTypeFile, &qChild) == 0, + "rename create target child"); + store.CommitThrough(qChild.txnId); + Check(store.Rename(ROOTFID, "p", "q", true, 0, 0) == -ENOTEMPTY, + "rename over non-empty dir rejected"); +} + + + void +TestNamespaceStoreCheckpoint() +{ + Config cfg; + cfg.dirLargeThreshold = 1; + NamespaceStore store(cfg); + + CreateResult d; + Check(store.Create(ROOTFID, "d", kInodeTypeDir, &d, + 10, 20, 0755, 0, 100) == 0, + "checkpoint mkdir d"); + store.CommitThrough(d.txnId); + CreateResult e; + Check(store.Create(ROOTFID, "e", kInodeTypeDir, &e, + 11, 21, 0750, 0, 101) == 0, + "checkpoint mkdir e"); + store.CommitThrough(e.txnId); + + CreateResult removed; + Check(store.Create(d.fid, "removed", kInodeTypeFile, &removed) == 0, + "checkpoint create removed file"); + store.CommitThrough(removed.txnId); + TxnId txnId = 0; + Check(store.RemoveFile(d.fid, "removed", &txnId) == 0, + "checkpoint remove file"); + store.CommitThrough(txnId); + + CreateResult file; + Check(store.Create(d.fid, "f", kInodeTypeFile, &file, + 12, 22, 0644, 3, 102) == 0, + "checkpoint create file"); + store.CommitThrough(file.txnId); + Check(store.Rename(d.fid, "f", "/e/g", false, &txnId, 0) == 0, + "checkpoint rename file"); + store.CommitThrough(txnId); + + LookupResult originalMoved; + Check(store.LookupPath(ROOTFID, "/e/g", originalMoved) == 0, + "checkpoint original moved lookup"); + + std::stringstream image; + Check(store.SaveCheckpoint(image) == 0, + "checkpoint save"); + + NamespaceStore restored(cfg); + Check(restored.LoadCheckpoint(image) == 0, + "checkpoint load"); + Check(restored.GetCommittedTxn() == store.GetCommittedTxn(), + "checkpoint committed txn restored"); + 
Check(restored.GetLastTxn() == store.GetLastTxn(), + "checkpoint last txn restored"); + + LookupResult lookup; + Check(restored.LookupPath(ROOTFID, "/e/g", lookup) == 0 && + lookup.fid == file.fid && lookup.user == 12 && + lookup.group == 22 && lookup.mode == 0644 && + lookup.numReplicas == 3 && + lookup.parentGeneration == originalMoved.parentGeneration, + "checkpoint restored moved file attrs"); + Check(restored.LookupPath(ROOTFID, "/d/f", lookup) == -ENOENT, + "checkpoint old rename source absent"); + Check(restored.LookupPath(ROOTFID, "/d/removed", lookup) == -ENOENT, + "checkpoint removed file absent"); + + ReaddirResult readdir; + Check(restored.Readdir(e.fid, 0, 10, readdir) == 0 && + readdir.entries.size() == 1 && + readdir.entries[0].childFid == file.fid, + "checkpoint restored dir entries"); + + const TxnId lastTxn = restored.GetLastTxn(); + CreateResult after; + Check(restored.Create(ROOTFID, "after", kInodeTypeFile, &after) == 0, + "checkpoint create after restore"); + Check(after.txnId == lastTxn + 1 && after.fid > file.fid, + "checkpoint seeds continue after restore"); +} + + void +TestNamespaceStoreEditLog() +{ + Config cfg; + cfg.dirLargeThreshold = 1; + NamespaceStore store(cfg); + + CreateResult d; + Check(store.Create(ROOTFID, "d", kInodeTypeDir, &d, + 10, 20, 0755, 0, 100) == 0, + "edit log mkdir d"); + store.CommitThrough(d.txnId); + CreateResult e; + Check(store.Create(ROOTFID, "e", kInodeTypeDir, &e, + 11, 21, 0750, 0, 101) == 0, + "edit log mkdir e"); + store.CommitThrough(e.txnId); + + std::stringstream checkpoint; + Check(store.SaveCheckpoint(checkpoint) == 0, + "edit log checkpoint save"); + + std::stringstream logs; + CreateResult tmp; + Check(store.Create(d.fid, "tmp", kInodeTypeFile, &tmp, + 30, 40, 0600, 1, 200) == 0, + "edit log create tmp"); + EditLogRecord tmpCreate; + tmpCreate.type = EditLogRecord::kCreate; + tmpCreate.txnId = tmp.txnId; + tmpCreate.parentFid = d.fid; + tmpCreate.name = "tmp"; + tmpCreate.fid = tmp.fid; + 
tmpCreate.inodeType = kInodeTypeFile; + tmpCreate.user = 30; + tmpCreate.group = 40; + tmpCreate.mode = 0600; + tmpCreate.numReplicas = 1; + tmpCreate.mtime = 200; + Check(WriteEditLog(logs, tmpCreate) == 0, + "edit log write tmp create"); + store.CommitThrough(tmp.txnId); + + TxnId removeTxn = 0; + Check(store.RemoveFile(d.fid, "tmp", &removeTxn) == 0, + "edit log remove tmp"); + EditLogRecord tmpRemove; + tmpRemove.type = EditLogRecord::kRemove; + tmpRemove.txnId = removeTxn; + tmpRemove.parentFid = d.fid; + tmpRemove.name = "tmp"; + tmpRemove.inodeType = kInodeTypeFile; + Check(WriteEditLog(logs, tmpRemove) == 0, + "edit log write tmp remove"); + store.CommitThrough(removeTxn); + + CreateResult file; + Check(store.Create(d.fid, "f", kInodeTypeFile, &file, + 31, 41, 0644, 3, 201) == 0, + "edit log create file"); + EditLogRecord fileCreate; + fileCreate.type = EditLogRecord::kCreate; + fileCreate.txnId = file.txnId; + fileCreate.parentFid = d.fid; + fileCreate.name = "f"; + fileCreate.fid = file.fid; + fileCreate.inodeType = kInodeTypeFile; + fileCreate.user = 31; + fileCreate.group = 41; + fileCreate.mode = 0644; + fileCreate.numReplicas = 3; + fileCreate.mtime = 201; + Check(WriteEditLog(logs, fileCreate) == 0, + "edit log write file create"); + store.CommitThrough(file.txnId); + + TxnId renameTxn = 0; + fid_t srcFid = -1; + Check(store.Rename(d.fid, "f", "/e/g", false, + &renameTxn, &srcFid) == 0 && srcFid == file.fid, + "edit log rename file"); + EditLogRecord rename; + rename.type = EditLogRecord::kRename; + rename.txnId = renameTxn; + rename.parentFid = d.fid; + rename.name = "f"; + rename.fid = srcFid; + rename.newPath = "/e/g"; + rename.overwriteFlag = false; + Check(WriteEditLog(logs, rename) == 0, + "edit log write rename"); + store.CommitThrough(renameTxn); + + NamespaceStore restored(cfg); + Check(restored.LoadCheckpoint(checkpoint) == 0, + "edit log checkpoint load"); + Check(restored.ApplyEditLog(logs) == 0, + "edit log replay stream"); + + 
LookupResult lookup; + Check(restored.LookupPath(ROOTFID, "/e/g", lookup) == 0 && + lookup.fid == file.fid && lookup.user == 31 && + lookup.group == 41 && lookup.mode == 0644 && + lookup.numReplicas == 3, + "edit log replay moved file attrs"); + Check(restored.LookupPath(ROOTFID, "/d/f", lookup) == -ENOENT, + "edit log replay old rename source absent"); + Check(restored.LookupPath(ROOTFID, "/d/tmp", lookup) == -ENOENT, + "edit log replay removed file absent"); + Check(restored.GetCommittedTxn() == store.GetCommittedTxn(), + "edit log committed txn restored"); + Check(restored.GetLastTxn() == store.GetLastTxn(), + "edit log last txn restored"); + + CreateResult after; + Check(restored.Create(ROOTFID, "after_log", kInodeTypeFile, &after) == 0, + "edit log create after replay"); + Check(after.txnId == store.GetLastTxn() + 1 && after.fid > file.fid, + "edit log seeds continue after replay"); + + EditLogRecord parsed; + Check(ReadEditLog("namespacev2_edit 1 create 0 3 4 0 1 1 0 1 1 61", + parsed) == -EINVAL, + "edit log rejects zero txn"); +} + + void +TestNamespaceStoreEditLogFailedCreateNoop() +{ + Config cfg; + NamespaceStore store(cfg); + + fid_t fid = -1; + TxnId txnId = 0; + store.ReserveCreateIds(fid, txnId); + EditLogRecord first; + first.type = EditLogRecord::kCreate; + first.txnId = txnId; + first.parentFid = ROOTFID; + first.name = "dup"; + first.fid = fid; + first.inodeType = kInodeTypeFile; + first.mode = 0644; + Check(store.ApplyEditLog(first) == 0, + "edit log first create succeeds"); + + fid_t failedFid = -1; + TxnId failedTxnId = 0; + store.ReserveCreateIds(failedFid, failedTxnId); + EditLogRecord duplicate(first); + duplicate.txnId = failedTxnId; + duplicate.fid = failedFid; + Check(store.ApplyEditLog(duplicate) == -EEXIST, + "edit log duplicate create fails"); + Check(store.GetCommittedTxn() == failedTxnId, + "edit log failed create commits no-op txn"); + Check(store.GetLastTxn() == failedTxnId, + "edit log failed create advances txn seed"); + + 
LookupResult lookup; + Check(store.Lookup(ROOTFID, "dup", lookup) == 0 && lookup.fid == fid, + "edit log failed create keeps original entry"); + + CreateResult after; + Check(store.Create(ROOTFID, "after_failed", kInodeTypeFile, &after) == 0, + "edit log create after failed no-op"); + Check(after.txnId == failedTxnId + 1 && after.fid > failedFid, + "edit log create continues after failed no-op"); +} + + + void +TestNamespaceStoreEditLogBatchCreateCommit() +{ + Config cfg; + NamespaceStore store(cfg); + + fid_t parentFid = -1; + TxnId parentTxn = 0; + store.ReserveCreateIds(parentFid, parentTxn); + EditLogRecord parent; + parent.type = EditLogRecord::kCreate; + parent.txnId = parentTxn; + parent.parentFid = ROOTFID; + parent.name = "batch_parent"; + parent.fid = parentFid; + parent.inodeType = kInodeTypeDir; + parent.mode = 0755; + Check(store.ApplyCreate(parent.parentFid, parent.name, parent.inodeType, + parent.fid, parent.txnId, kKfsUserRoot, kKfsGroupRoot, + parent.mode, 0, 0, false, false) == 0, + "batch parent create succeeds"); + Check(store.GetCommittedTxn() < parentTxn, + "edit log batch parent not globally committed yet"); + + fid_t childFid = -1; + TxnId childTxn = 0; + store.ReserveCreateIds(childFid, childTxn); + EditLogRecord child; + child.type = EditLogRecord::kCreate; + child.txnId = childTxn; + child.parentFid = parentFid; + child.name = "child"; + child.fid = childFid; + child.inodeType = kInodeTypeDir; + child.mode = 0755; + Check(store.ApplyCreate(child.parentFid, child.name, child.inodeType, + child.fid, child.txnId, kKfsUserRoot, kKfsGroupRoot, + child.mode, 0, 0, false, false) == 0, + "batch child sees prior parent create"); + + store.CommitThroughRange(parentTxn, childTxn); + LookupResult lookup; + Check(store.Lookup(parentFid, "child", lookup) == 0 && + lookup.fid == childFid, + "edit log batch committed child lookup"); + Check(store.GetCommittedTxn() == childTxn, + "edit log batch commit through last txn"); +} + + + void 
+TestNamespaceStoreConcurrentShardLocks() +{ + Config cfg; + cfg.dirLargeThreshold = 8; + NamespaceStore store(cfg); + + const int kThreads = 8; + const int kFilesPerThread = 100; + vector parents; + parents.reserve(kThreads); + for (int t = 0; t < kThreads; ++t) { + CreateResult dir; + Check(store.Create(ROOTFID, MakeName("cd", t), + kInodeTypeDir, &dir) == 0, + "concurrent mkdir parent"); + store.CommitThrough(dir.txnId); + parents.push_back(dir.fid); + } + + std::atomic failures(0); + vector threads; + for (int t = 0; t < kThreads; ++t) { + threads.push_back(std::thread([&store, &parents, &failures, t]() { + for (int i = 0; i < kFilesPerThread; ++i) { + CreateResult create; + const int status = store.Create(parents[t], + MakeName("f", t, i), kInodeTypeFile, &create); + if (status != 0) { + ++failures; + continue; + } + store.CommitThrough(create.txnId); + } + })); + } + for (size_t i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + Check(failures.load() == 0, "concurrent create threads"); + + LookupResult lookup; + for (int t = 0; t < kThreads; ++t) { + for (int i = 0; i < kFilesPerThread; ++i) { + Check(store.Lookup(parents[t], MakeName("f", t, i), + lookup) == 0, + "concurrent create lookup"); + } + } + + failures = 0; + threads.clear(); + for (int t = 0; t < kThreads; ++t) { + threads.push_back(std::thread([&store, &parents, &failures, t]() { + for (int i = 0; i < kFilesPerThread; ++i) { + TxnId txnId = 0; + const int status = store.Rename(parents[t], + MakeName("f", t, i), MakeName("g", t, i), + false, &txnId, 0); + if (status != 0) { + ++failures; + continue; + } + store.CommitThrough(txnId); + } + })); + } + for (size_t i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + Check(failures.load() == 0, "concurrent rename threads"); + for (int t = 0; t < kThreads; ++t) { + for (int i = 0; i < kFilesPerThread; ++i) { + Check(store.Lookup(parents[t], MakeName("f", t, i), + lookup) == -ENOENT, + "concurrent rename old missing"); + 
Check(store.Lookup(parents[t], MakeName("g", t, i), + lookup) == 0, + "concurrent rename new visible"); + } + } + + failures = 0; + threads.clear(); + for (int t = 0; t < kThreads; ++t) { + threads.push_back(std::thread([&store, &parents, &failures, t]() { + for (int i = 0; i < kFilesPerThread; ++i) { + TxnId txnId = 0; + const int status = store.RemoveFile(parents[t], + MakeName("g", t, i), &txnId); + if (status != 0) { + ++failures; + continue; + } + store.CommitThrough(txnId); + } + })); + } + for (size_t i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + Check(failures.load() == 0, "concurrent remove threads"); + for (int t = 0; t < kThreads; ++t) { + for (int i = 0; i < kFilesPerThread; ++i) { + Check(store.Lookup(parents[t], MakeName("g", t, i), + lookup) == -ENOENT, + "concurrent remove missing"); + } + } +} + + + void +TestResourceLockOrdering() +{ + vector locks; + locks.push_back(ResourceLockKey(ResourceLockKey::kInode, 10)); + locks.push_back(ResourceLockKey(ResourceLockKey::kDir, 3, 20)); + locks.push_back(ResourceLockKey(ResourceLockKey::kSnapshot, 0)); + locks.push_back(ResourceLockKey(ResourceLockKey::kDir, 2, 30)); + locks.push_back(ResourceLockKey(ResourceLockKey::kEditLog, 0)); + std::sort(locks.begin(), locks.end()); + Check(locks[0].resourceClass == ResourceLockKey::kSnapshot, + "snapshot lock first"); + Check(locks[1].resourceClass == ResourceLockKey::kDir && + locks[1].major == 2, + "dir locks sorted by shard"); + Check(locks[2].resourceClass == ResourceLockKey::kDir && + locks[2].major == 3, + "dir locks sorted by shard second"); + Check(locks[3].resourceClass == ResourceLockKey::kInode, + "inode after dir"); + Check(locks[4].resourceClass == ResourceLockKey::kEditLog, + "edit log last"); +} + +} // namespace + + int +main( + int /* argc */, + char** /* argv */) +{ + TestConfig(); + TestNameKey(); + TestPendingCommittedVisibility(); + TestSmallCookieInvalidation(); + TestPromotion(); + TestLargeCookieStableAcrossInsert(); + 
TestInodeTable(); + TestNamespaceStoreBasic(); + TestNamespaceStoreDirectory(); + TestNamespaceStorePathAndRmdir(); + TestNamespaceStoreRename(); + TestNamespaceStoreCheckpoint(); + TestNamespaceStoreEditLog(); + TestNamespaceStoreEditLogFailedCreateNoop(); + TestNamespaceStoreEditLogBatchCreateCommit(); + TestNamespaceStoreConcurrentShardLocks(); + TestResourceLockOrdering(); + if (gErrorCount != 0) { + cout << gErrorCount << " NamespaceV2 tests failed\n"; + return 1; + } + cout << "NamespaceV2 tests passed\n"; + return 0; +} diff --git a/src/cc/meta/namespacev2walreplaytest_main.cc b/src/cc/meta/namespacev2walreplaytest_main.cc new file mode 100644 index 000000000..b15177166 --- /dev/null +++ b/src/cc/meta/namespacev2walreplaytest_main.cc @@ -0,0 +1,169 @@ +//---------------------------------------------------------- -*- Mode: C++ -*- +// Minimal integration test: +// write nv2batch WAL line -> Replay::playLine() -> validate NamespaceV2 state. +// +// This intentionally bypasses LogWriter / disk IO, and tests the log record +// format + replay parser + NamespaceV2 apply/commit chain. 
+ +#include "Replay.h" +#include "NamespaceV2.h" + +#include "kfsio/Base64.h" + +#include + +#include +#include +#include + +using std::string; +using std::vector; +using std::cout; + +namespace +{ + +static void +AppendLe( + string& out, + uint64_t v, + size_t bytes) +{ + for (size_t i = 0; i < bytes; i++) { + out.push_back((char)(v >> (i * 8))); + } +} + +static void +AppendU8(string& out, uint8_t v) { AppendLe(out, v, 1); } +static void +AppendU16(string& out, uint16_t v) { AppendLe(out, v, 2); } +static void +AppendU32(string& out, uint32_t v) { AppendLe(out, v, 4); } +static void +AppendU64(string& out, uint64_t v) { AppendLe(out, v, 8); } +static void +AppendI16(string& out, int16_t v) { AppendLe(out, (uint16_t)v, 2); } +static void +AppendI64(string& out, int64_t v) { AppendLe(out, (uint64_t)v, 8); } + +static string +EncodeNv2BatchPayload() +{ + // Two ops: + // 1) mkdir /d + // 2) create /d/f + const KFS::fid_t root = KFS::ROOTFID; + const KFS::fid_t dFid = 1001; + const KFS::fid_t fFid = 1002; + // Use contiguous txn ids, and let main() seed committed txn to 0. 
+ const uint64_t dTxn = 1; + const uint64_t fTxn = 2; + const uint32_t user = 1; + const uint32_t group = 1; + const uint16_t modeDir = 0755; + const uint16_t modeFile = 0644; + const int16_t repl = 1; + const int64_t mtime = 123456789; + + string payload; + payload.reserve(256); + + // opType=2 mkdir + AppendU8(payload, 2); + AppendI64(payload, (int64_t)root); + AppendI64(payload, (int64_t)dFid); + AppendU64(payload, dTxn); + AppendU32(payload, user); + AppendU32(payload, group); + AppendU16(payload, modeDir); + AppendI16(payload, 0); + AppendI64(payload, mtime); + const string dname("d"); + AppendU16(payload, (uint16_t)dname.size()); + payload.append(dname); + + // opType=1 create file + AppendU8(payload, 1); + AppendI64(payload, (int64_t)dFid); + AppendI64(payload, (int64_t)fFid); + AppendU64(payload, fTxn); + AppendU32(payload, user); + AppendU32(payload, group); + AppendU16(payload, modeFile); + AppendI16(payload, repl); + AppendI64(payload, mtime); + const string fname("f"); + AppendU16(payload, (uint16_t)fname.size()); + payload.append(fname); + + return payload; +} + +static string +Base64Encode( + const string& bytes) +{ + vector buf((size_t)KFS::Base64::GetEncodedMaxBufSize((int)bytes.size())); + const int len = KFS::Base64::Encode(bytes.data(), (int)bytes.size(), &buf[0], true); + if (len <= 0) { + return string(); + } + return string(&buf[0], len); +} + +static int +Fail( + const char* msg) +{ + cout << "FAILED: " << msg << "\n"; + return 1; +} + +} // anonymous + +int +main() +{ + // Ensure namespace v2 store exists in this process. + KFS::NamespaceV2::NamespaceStore& store = KFS::NamespaceV2::GetStore(); + // Simulate "checkpoint committed txn == 0" so that replay can commit a + // contiguous txn range starting at 1. 
+ store.CommitThroughRange(0, 0); + (void)store; + + const string payload = EncodeNv2BatchPayload(); + const string b64 = Base64Encode(payload); + if (b64.empty()) { + return Fail("base64 encode"); + } + + // One nv2batch line + one placeholder, to mimic WAL sequence count. + const string line1 = "nv2batch/c/2/b/" + b64 + "\n"; + const string line2 = "nv2batchc\n"; + + // Use a fresh block seq for each line. + int status = KFS::replayer.playLine(line1.data(), (int)line1.size(), 1); + if (status != 0) { + cout << "nv2batch line: " << line1; + return Fail("replay nv2batch"); + } + status = KFS::replayer.playLine(line2.data(), (int)line2.size(), 2); + if (status != 0) { + return Fail("replay nv2batchc"); + } + + // Verify namespace state after replay commit. + KFS::NamespaceV2::LookupResult d; + if (KFS::NamespaceV2::GetStore().Lookup(KFS::ROOTFID, "d", d) != 0 || + d.type != KFS::NamespaceV2::kInodeTypeDir) { + return Fail("lookup dir d"); + } + KFS::NamespaceV2::LookupResult f; + if (KFS::NamespaceV2::GetStore().Lookup(d.fid, "f", f) != 0 || + f.type != KFS::NamespaceV2::kInodeTypeFile) { + return Fail("lookup file f"); + } + cout << "NamespaceV2 WAL replay integration test passed\n"; + return 0; +} diff --git a/src/cc/qcdio/QCThread.cc b/src/cc/qcdio/QCThread.cc index bc6c3e6d5..61b340ab8 100644 --- a/src/cc/qcdio/QCThread.cc +++ b/src/cc/qcdio/QCThread.cc @@ -125,12 +125,7 @@ QCThread::~QCThread() QCThread::Join(); } -const int kMinThreadStackSize = -#ifdef PTHREAD_STACK_MIN - PTHREAD_STACK_MIN + (4 << 10); -#else - (8 << 10); -#endif +const int kMinThreadStackSize = 256 << 10; int QCThread::TryToStart( diff --git a/src/cc/tools/qfsput_main.cc b/src/cc/tools/qfsput_main.cc index 30a189be2..129c51cea 100644 --- a/src/cc/tools/qfsput_main.cc +++ b/src/cc/tools/qfsput_main.cc @@ -58,7 +58,7 @@ main(int argc, char **argv) const char* config = 0; ssize_t numBytes; - while ((optchar = getopt(argc, argv, "hs:p:f:v")) != -1) { + while ((optchar = getopt(argc, argv, 
"hs:p:f:c:v")) != -1) { switch (optchar) { case 'f': kfspathname = optarg; @@ -105,9 +105,7 @@ main(int argc, char **argv) } numBytes = doPut(kfspathname); - if (numBytes <= 0) { - cout << "Wrote " << numBytes << " to " << kfspathname << "\n"; - } + cout << "Wrote " << numBytes << " to " << kfspathname << "\n"; delete gKfsClient; return (numBytes < 0 ? 1 : 0); @@ -124,12 +122,13 @@ doPut(const string &filename) cout << "Create failed: " << ErrorCodeToStr(fd) << "\n"; return fd; } - while(cin.read(dataBuf, sizeof(dataBuf))) { + while(cin.read(dataBuf, sizeof(dataBuf)) || cin.gcount() > 0) { const size_t cnt = cin.gcount(); const int res = gKfsClient->Write(fd, dataBuf, cnt); if (res != (int)cnt) { cout << "Write failed...expect to write: " << cnt << " but only wrote: " << res << "\n"; + gKfsClient->Close(fd); return -1; } bytesWritten += res;