From a361e277cdab1204c5dfa79da56b1a69287d12df Mon Sep 17 00:00:00 2001
From: zhangzhibiao <zhangzhibiao@bigo.sg>
Date: Fri, 22 May 2026 10:34:00 +0800
Subject: [PATCH 1/7] Place build artifacts under output

---
 CMakeLists.txt                    | 40 ++++++++++++++++++++++--
 Makefile                          | 51 ++++++++++++++++++++++---------
 benchmarks/mstress/CMakeLists.txt |  2 ++
 examples/cc/CMakeLists.txt        |  3 ++
 src/cc/devtools/CMakeLists.txt    |  2 ++
 src/cc/emulator/CMakeLists.txt    |  2 ++
 src/cc/qcrs/CMakeLists.txt        |  2 ++
 src/cc/qfsc/CMakeLists.txt        |  2 ++
 src/cc/tools/CMakeLists.txt       |  2 ++
 9 files changed, 90 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 297d17bd7..330ece184 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -135,10 +135,46 @@ if(NOT OPENSSL_VERSION OR OPENSSL_VERSION MATCHES "^0[.]")
     message(STATUS "WARNING: QFS authentication will not work properly")
 endif()
 
+if(NOT DEFINED QFS_OUTPUT_DIR)
+    set(QFS_OUTPUT_DIR "${KFS_DIR_PREFIX}/output" CACHE PATH
+        "directory for deployable QFS build output")
+endif()
+get_filename_component(QFS_OUTPUT_DIR "${QFS_OUTPUT_DIR}" ABSOLUTE
+    BASE_DIR "${KFS_DIR_PREFIX}")
+
 # Change this to where the install directory is located
-if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
-    set(CMAKE_INSTALL_PREFIX "." CACHE PATH "install directory prefix" FORCE)
+if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT OR
+    CMAKE_INSTALL_PREFIX STREQUAL ".")
+    set(CMAKE_INSTALL_PREFIX "${QFS_OUTPUT_DIR}" CACHE PATH
+        "install directory prefix" FORCE)
 endif()
+message(STATUS "QFS deployable output directory: ${CMAKE_INSTALL_PREFIX}")
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${QFS_OUTPUT_DIR}/bin" CACHE PATH
+    "directory for QFS runtime build output" FORCE)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${QFS_OUTPUT_DIR}/lib" CACHE PATH
+    "directory for QFS shared library build output" FORCE)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${QFS_OUTPUT_DIR}/lib/static" CACHE PATH
+    "directory for QFS static library build output" FORCE)
+foreach(QFS_OUTPUT_CONFIG DEBUG RELEASE RELWITHDEBINFO MINSIZEREL)
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${QFS_OUTPUT_CONFIG}
+        "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}" CACHE PATH
+        "directory for QFS runtime build output" FORCE)
+    set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_${QFS_OUTPUT_CONFIG}
+        "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" CACHE PATH
+        "directory for QFS shared library build output" FORCE)
+    set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_${QFS_OUTPUT_CONFIG}
+        "${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}" CACHE PATH
+        "directory for QFS static library build output" FORCE)
+endforeach()
+# qfs_set_target_runtime_output_dir(<output_dir> <target>...): point RUNTIME_OUTPUT_DIRECTORY and its per-config variants of every listed target at <output_dir>.
+function(qfs_set_target_runtime_output_dir output_dir)
+    set_target_properties(${ARGN} PROPERTIES  # ARGN: the target names passed after output_dir
+        RUNTIME_OUTPUT_DIRECTORY "${output_dir}")
+    foreach(QFS_OUTPUT_CONFIG DEBUG RELEASE RELWITHDEBINFO MINSIZEREL)  # also set each per-config property
+        set_target_properties(${ARGN} PROPERTIES
+            RUNTIME_OUTPUT_DIRECTORY_${QFS_OUTPUT_CONFIG} "${output_dir}")
+    endforeach()
+endfunction()
 
 # Build with statically linked libraries; the value for this variable has to be
 # defined here overwriting whatever is in the cache.
diff --git a/Makefile b/Makefile
index 9a1da4d61..1c10e3227 100644
--- a/Makefile
+++ b/Makefile
@@ -22,6 +22,7 @@
 # Do not assume GNU Make. Keep this makefile as simple as possible.
 
 BUILD_TYPE=release
+QFS_OUTPUT_DIR=output
 CMAKE_OPTIONS=-D CMAKE_BUILD_TYPE=RelWithDebInfo
 CMAKE=cmake
 MAKE_OPTIONS=
@@ -31,7 +32,7 @@ QFSHADOOP_VERSIONS=0.23.11  1.0.4  1.1.2  2.5.1  2.7.2  2.7.7  2.8.5  2.9.2  2.1
 
 QFS_PYTHON_DIR=python-qfs
 QFS_PYTHON_WHEEL_DIR=${QFS_PYTHON_DIR}/dist
-QFS_PYTHON_TEST_OPTION=test -d ${QFS_PYTHON_WHEEL_DIR} && echo -python-wheel-dir ${QFS_PYTHON_WHEEL_DIR}
+QFS_PYTHON_TEST_OPTION=test -d $${qfs_output_dir}/${QFS_PYTHON_WHEEL_DIR} && echo -python-wheel-dir $${qfs_output_dir}/${QFS_PYTHON_WHEEL_DIR}
 QFS_MSTRESS_ON=true
 
 .PHONY: all
@@ -39,11 +40,16 @@ all: build
 
 .PHONY: dir
 dir:
-	mkdir -p build/${BUILD_TYPE}
+	mkdir -p build/${BUILD_TYPE} ${QFS_OUTPUT_DIR}
 
 .PHONY: run-cmake
 run-cmake: dir
-	cd build/${BUILD_TYPE} && ${CMAKE} ${CMAKE_OPTIONS} ../..
+	cd build/${BUILD_TYPE} && \
+		qfs_output_dir=`cd ../.. && pwd`/${QFS_OUTPUT_DIR} && \
+		${CMAKE} \
+			-D QFS_OUTPUT_DIR="$$qfs_output_dir" \
+			-D CMAKE_INSTALL_PREFIX="$$qfs_output_dir" \
+			${CMAKE_OPTIONS} ../..
 
 .PHONY: build
 build: run-cmake
@@ -51,11 +57,19 @@ build: run-cmake
 	`${QFS_MSTRESS_ON} && \
 		echo ${QFSHADOOP_VERSIONS} | grep '3\.4\.1' >/dev/null 2>&1 && \
 		mvn --version >/dev/null 2>&1 && echo mstress-bootstrap mstress-tarball`
+	if ls -1 build/${BUILD_TYPE}/benchmarks/mstress*.tgz >/dev/null 2>&1; then \
+		mkdir -p ${QFS_OUTPUT_DIR}/benchmarks && \
+		cp build/${BUILD_TYPE}/benchmarks/mstress*.tgz \
+			${QFS_OUTPUT_DIR}/benchmarks/; \
+	fi
 
 .PHONY: java
 java: build
 	./src/java/javabuild.sh ${JAVA_BUILD_OPTIONS} clean
 	./src/java/javabuild.sh ${JAVA_BUILD_OPTIONS}
+	if ls -1 build/java/qfs-access/qfs-access*.jar >/dev/null 2>&1; then \
+		cp build/java/qfs-access/qfs-access*.jar ${QFS_OUTPUT_DIR}/lib/; \
+	fi
 
 .PHONY: hadoop-jars
 hadoop-jars: java
@@ -67,6 +81,9 @@ hadoop-jars: java
 	            || exit 1; \
 	    done \
 	; fi
+	if ls -1 build/java/hadoop-qfs/hadoop-*.jar >/dev/null 2>&1; then \
+		cp build/java/hadoop-qfs/hadoop-*.jar ${QFS_OUTPUT_DIR}/lib/; \
+	fi
 
 .PHONY: go
 go: build
@@ -76,7 +93,7 @@ go: build
 			exit; \
 		} \
 		END { exit ret ? 0 : 1 }'; then \
-		QFS_BUILD_DIR=`pwd`/build/$(BUILD_TYPE) && \
+		QFS_BUILD_DIR=`pwd`/${QFS_OUTPUT_DIR} && \
 		cd src/go && \
 		CGO_CFLAGS="-I$${QFS_BUILD_DIR}/include" && \
 		export CGO_CFLAGS && \
@@ -93,6 +110,7 @@ go: build
 .PHONY: tarball
 tarball: hadoop-jars python
 	cd build && \
+	qfs_output_dir=../${QFS_OUTPUT_DIR}; \
 	myuname=`uname -s`; \
 	myarch=`cc -dumpmachine 2>/dev/null | cut -d - -f 1` ; \
 	[ x"$$myarch" = x ] && \
@@ -129,16 +147,16 @@ tarball: hadoop-jars python
 	{ test -d tmpreldir || mkdir tmpreldir; } && \
 	rm -rf "tmpreldir/$$tarname" && \
 	mkdir "tmpreldir/$$tarname" && \
-	cp -r ${BUILD_TYPE}/bin ${BUILD_TYPE}/lib \
-		${BUILD_TYPE}/include ../scripts ../webui \
+	cp -r $$qfs_output_dir/bin $$qfs_output_dir/lib \
+		$$qfs_output_dir/include ../scripts ../webui \
 	     ../examples ../benchmarks "tmpreldir/$$tarname/" && \
 	if ls -1 ./java/qfs-access/qfs-access-*.jar >/dev/null 2>&1; then \
 	    cp ./java/qfs-access/qfs-access*.jar "tmpreldir/$$tarname/lib/"; fi && \
 	if ls -1 ./java/hadoop-qfs/hadoop-*.jar >/dev/null 2>&1; then \
 	    cp ./java/hadoop-qfs/hadoop-*.jar "tmpreldir/$$tarname/lib/"; fi && \
-	if ls -1 ${BUILD_TYPE}/${QFS_PYTHON_WHEEL_DIR}/qfs*.whl >/dev/null 2>&1; \
+	if ls -1 $$qfs_output_dir/${QFS_PYTHON_WHEEL_DIR}/qfs*.whl >/dev/null 2>&1; \
 		then \
-		cp ${BUILD_TYPE}/${QFS_PYTHON_WHEEL_DIR}/qfs*.whl \
+		cp $$qfs_output_dir/${QFS_PYTHON_WHEEL_DIR}/qfs*.whl \
 			"tmpreldir/$$tarname/lib/"; fi && \
 	if ls -1 ${BUILD_TYPE}/benchmarks/mstress.tgz > /dev/null 2>&1; then \
 		cp ${BUILD_TYPE}/benchmarks/mstress.tgz \
@@ -151,7 +169,7 @@ python: build
 	if python3 -c 'import sys; exit(0 if sys.version_info >= (3, 6) else 1)' \
 			>/dev/null 2>&1 && \
 			python3 -c 'import venv' >/dev/null 2>&1 ; then \
-		cd build/${BUILD_TYPE} && \
+		qfs_src_dir=`pwd`/src && cd ${QFS_OUTPUT_DIR} && \
 		rm -rf ${QFS_PYTHON_DIR} && \
 		mkdir ${QFS_PYTHON_DIR} && \
 		cd ${QFS_PYTHON_DIR} && \
@@ -159,7 +177,10 @@ python: build
-		ln -s ../../../src/cc/access/kfs_setup.py setup.py && \
+		ln -s "$$qfs_src_dir/cc/access/kfs_setup.py" setup.py && \
 		python3 -m venv .venv && \
 		. .venv/bin/activate && python -m pip install build && \
-		python -m build -w . ; \
+		python -m build -w . && \
+		if ls -1 dist/qfs*.whl >/dev/null 2>&1; then \
+			cp dist/qfs*.whl ../lib/; \
+		fi ; \
 	else \
 		echo 'python3 module venv is not available'; \
 	fi
@@ -167,14 +188,16 @@ python: build
 .PHONY: mintest
 mintest: hadoop-jars python
 	cd build/${BUILD_TYPE} && \
+	qfs_output_dir=`cd ../.. && pwd`/${QFS_OUTPUT_DIR} && \
 	../../src/test-scripts/qfstest.sh \
 		`${QFS_PYTHON_TEST_OPTION}` \
-		-install-prefix . -auth ${QFSTEST_OPTIONS}
+		-install-prefix "$$qfs_output_dir" -auth ${QFSTEST_OPTIONS}
 
 .PHONY: test
 test: mintest
 	cd build/${BUILD_TYPE} && \
-	installbindir=`pwd`/bin && \
+	qfs_output_dir=`cd ../.. && pwd`/${QFS_OUTPUT_DIR} && \
+	installbindir=$$qfs_output_dir/bin && \
 	metadir=$$installbindir && \
 	export metadir && \
 	chunkdir=$$installbindir && \
@@ -192,7 +215,7 @@ test: mintest
 		echo '--------- Test without authentication --------' && \
 		../../src/test-scripts/qfstest.sh \
 			`${QFS_PYTHON_TEST_OPTION}` \
-			-install-prefix . -noauth ${QFSTEST_OPTIONS} ; \
+			-install-prefix "$$qfs_output_dir" -noauth ${QFSTEST_OPTIONS} ; \
 	fi
 
 .PHONY: rat
@@ -201,4 +224,4 @@ rat: dir
 
 .PHONY: clean
 clean:
-	rm -rf build
+	rm -rf build ${QFS_OUTPUT_DIR}
diff --git a/benchmarks/mstress/CMakeLists.txt b/benchmarks/mstress/CMakeLists.txt
index 2f168d4a1..e6be75856 100644
--- a/benchmarks/mstress/CMakeLists.txt
+++ b/benchmarks/mstress/CMakeLists.txt
@@ -20,6 +20,8 @@
 #
 
 add_executable(mstress_client EXCLUDE_FROM_ALL mstress_client.cc)
+qfs_set_target_runtime_output_dir("${CMAKE_CURRENT_BINARY_DIR}"
+    mstress_client)
 
 if(USE_STATIC_LIB_LINKAGE)
     add_dependencies(mstress_client kfsClient)
diff --git a/examples/cc/CMakeLists.txt b/examples/cc/CMakeLists.txt
index d87e0ac0c..e12aa000d 100644
--- a/examples/cc/CMakeLists.txt
+++ b/examples/cc/CMakeLists.txt
@@ -32,5 +32,8 @@ else (USE_STATIC_LIB_LINKAGE)
     add_dependencies (qfssample kfsClient-shared)
 endif (USE_STATIC_LIB_LINKAGE)
 
+qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/examples"
+    qfssample)
+
 install (TARGETS qfssample
     RUNTIME DESTINATION bin/examples)
diff --git a/src/cc/devtools/CMakeLists.txt b/src/cc/devtools/CMakeLists.txt
index f527b2086..9b2f304be 100644
--- a/src/cc/devtools/CMakeLists.txt
+++ b/src/cc/devtools/CMakeLists.txt
@@ -55,6 +55,8 @@ foreach (exe_file ${exe_files})
         )
     endif (USE_STATIC_LIB_LINKAGE)
 endforeach (exe_file)
+qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/devtools"
+    ${exe_files})
 
 #
 install (TARGETS ${exe_files}
diff --git a/src/cc/emulator/CMakeLists.txt b/src/cc/emulator/CMakeLists.txt
index 1c0b8caa6..bd00d91d3 100644
--- a/src/cc/emulator/CMakeLists.txt
+++ b/src/cc/emulator/CMakeLists.txt
@@ -52,6 +52,8 @@ foreach (exe_file ${exe_files})
         kfsEmulator
     )
 endforeach (exe_file)
+qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/emulator"
+    ${exe_files})
 
 if (CMAKE_SYSTEM_NAME STREQUAL "SunOS")
     target_link_libraries(kfsEmulator mtmalloc)
diff --git a/src/cc/qcrs/CMakeLists.txt b/src/cc/qcrs/CMakeLists.txt
index c4a6efb7a..28c31a05e 100644
--- a/src/cc/qcrs/CMakeLists.txt
+++ b/src/cc/qcrs/CMakeLists.txt
@@ -166,6 +166,8 @@ add_executable (${rsmktablebin} mktable_main.c)
 target_link_libraries (${rstestbin} kfsrs)
 add_dependencies (${rstestbin} kfsrs)
 add_dependencies (${rsmktablebin} kfsrs)
+qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/devtools"
+    ${rstestbin} ${rsmktablebin})
 
 install (TARGETS kfsrs kfsrs-shared
         LIBRARY DESTINATION lib
diff --git a/src/cc/qfsc/CMakeLists.txt b/src/cc/qfsc/CMakeLists.txt
index f66d91acb..3b54fffdb 100644
--- a/src/cc/qfsc/CMakeLists.txt
+++ b/src/cc/qfsc/CMakeLists.txt
@@ -44,6 +44,8 @@ target_link_libraries (qfsc-shared
 
 add_executable (test-qfsc test-qfsc.c)
 set_target_properties (test-qfsc PROPERTIES LINKER_LANGUAGE CXX)
+qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/devtools"
+    test-qfsc)
 
 if (USE_STATIC_LIB_LINKAGE)
     add_dependencies (test-qfsc qfsc)
diff --git a/src/cc/tools/CMakeLists.txt b/src/cc/tools/CMakeLists.txt
index 74ca9c395..5f71fa688 100644
--- a/src/cc/tools/CMakeLists.txt
+++ b/src/cc/tools/CMakeLists.txt
@@ -102,6 +102,8 @@ foreach (exe_file ${exe_files})
         target_link_libraries (${exe_file} tools-shared)
     endif (USE_STATIC_LIB_LINKAGE)
 endforeach (exe_file)
+qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/tools"
+    ${exe_files})
 
 #
 install (TARGETS ${exe_files} tools

From b2fcfb5262794690a6e06538509b2728bced1715 Mon Sep 17 00:00:00 2001
From: zhangzhibiao <zhangzhibiao@bigo.sg>
Date: Mon, 25 May 2026 13:43:36 +0800
Subject: [PATCH 2/7] Improve mstress local benchmark behavior

---
 benchmarks/mstress/mstress.py        | 70 +++++++++++++++++++---------
 benchmarks/mstress/mstress_client.cc | 50 ++++++++++----------
 benchmarks/mstress/mstress_plan.py   |  7 ++-
 benchmarks/mstress/mstress_run.py    | 29 ++++++------
 4 files changed, 91 insertions(+), 65 deletions(-)

diff --git a/benchmarks/mstress/mstress.py b/benchmarks/mstress/mstress.py
index 173dd5d4b..2f3d0a14b 100755
--- a/benchmarks/mstress/mstress.py
+++ b/benchmarks/mstress/mstress.py
@@ -162,14 +162,26 @@ def PrintMemoryUsage(opts):
             Globals.SERVER_KEYWORD,
         )
 
-    proc = subprocess.Popen(
-        ["ssh", opts.server, psCmd],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-    )
+    if opts.server in ("localhost", "127.0.0.1"):
+        proc = subprocess.Popen(
+            [psCmd],
+            shell=True,
+            executable="/bin/bash",
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+    else:
+        proc = subprocess.Popen(
+            ["ssh", opts.server, psCmd],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
     result = proc.communicate()
     if result and len(result[0].strip()) > 0:
-        print("Memory usage %sKB" % result[0].strip())
+        memory = result[0].strip()
+        if not isinstance(memory, str):
+            memory = memory.decode("utf-8", "replace")
+        print("Memory usage %sKB" % memory)
     else:
         print("Memory usage <unknown> KB")
 
@@ -189,7 +201,7 @@ def RunMStressMaster(opts, hostsList):
     # print 'Master: called with %r, %r' % (opts, hostsList)
 
     startTime = datetime.datetime.now()
-    if RunMStressMasterTest(opts, hostsList, "create"):
+    if not RunMStressMasterTest(opts, hostsList, "create"):
         return False
     deltaTime = datetime.datetime.now() - startTime
     print(
@@ -200,7 +212,7 @@ def RunMStressMaster(opts, hostsList):
     print("==========================================")
 
     startTime = datetime.datetime.now()
-    if RunMStressMasterTest(opts, hostsList, "stat"):
+    if not RunMStressMasterTest(opts, hostsList, "stat"):
         return False
     deltaTime = datetime.datetime.now() - startTime
     print(
@@ -210,7 +222,7 @@ def RunMStressMaster(opts, hostsList):
     print("==========================================")
 
     startTime = datetime.datetime.now()
-    if RunMStressMasterTest(opts, hostsList, "readdir"):
+    if not RunMStressMasterTest(opts, hostsList, "readdir"):
         return False
     deltaTime = datetime.datetime.now() - startTime
     print(
@@ -221,10 +233,10 @@ def RunMStressMaster(opts, hostsList):
 
     if opts.leave_files:
         print("\nNot deleting files because of -l option")
-        return False
+        return True
 
     startTime = datetime.datetime.now()
-    if RunMStressMasterTest(opts, hostsList, "delete"):
+    if not RunMStressMasterTest(opts, hostsList, "delete"):
         return False
     deltaTime = datetime.datetime.now() - startTime
     print(
@@ -273,16 +285,30 @@ def RunMStressMasterTest(opts, hostsList, test):
             + opts.filesystem
             + ".slave.log"
         )
-        p = subprocess.Popen(
-            [
-                "/usr/bin/ssh",
-                client,
-                "%s -c %s -k %s >& %s"
-                % (ssh_cmd, client, clientHostMapping[client], slaveLogfile),
-            ],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+        slave_cmd = "%s -c %s -k %s >& %s" % (
+            ssh_cmd,
+            client,
+            clientHostMapping[client],
+            slaveLogfile,
         )
+        if client in ("localhost", "127.0.0.1"):
+            p = subprocess.Popen(
+                [slave_cmd],
+                shell=True,
+                executable="/bin/bash",
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+        else:
+            p = subprocess.Popen(
+                [
+                    "/usr/bin/ssh",
+                    client,
+                    slave_cmd,
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
         running_procs[p] = client
 
     success = True
@@ -323,7 +349,7 @@ def RunMStressMasterTest(opts, hostsList, test):
             else:
                 sys.stdout.write(".")
             sys.stdout.flush()
-            time.sleep(0.5)
+            time.sleep(0.05)
     return success
 
 
@@ -439,7 +465,7 @@ def RunMStressSlave(opts, clientsPerHost):
             else:
                 sys.stdout.write(".")
             sys.stdout.flush()
-            time.sleep(0.5)
+            time.sleep(0.05)
     return success
 
 
diff --git a/benchmarks/mstress/mstress_client.cc b/benchmarks/mstress/mstress_client.cc
index 15f416de4..96a7c1620 100644
--- a/benchmarks/mstress/mstress_client.cc
+++ b/benchmarks/mstress/mstress_client.cc
@@ -35,8 +35,8 @@
 #include <sstream>
 #include <string>
 #include <vector>
-#include <queue>
 #include <algorithm>
+#include <deque>
 
 #if __cplusplus >= 201103L
 #include <random>
@@ -185,11 +185,9 @@ void hexout(char* str, int len) {
   printf("\n");
 }
 
-void myitoa(int n, char* buf)
+void myitoa(int n, char* buf, size_t len = 32)
 {
-  static char result[32];
-  snprintf(result, 32, "%d", n);
-  strcpy(buf, result);
+  snprintf(buf, len, "%d", n);
 }
 
 //Return a random permutation of numbers in [0..range).
@@ -343,7 +341,7 @@ int CreateDFSPaths(Client* client, AutoCleanupKfsClient* kfs, int level, int* cr
   char name[512];
   strncpy(name, client->prefix_.c_str(), sizeof(name) / sizeof(name[0]) - 1);
   for (int i = 0; i < client->inodesPerLevel_; i++) {
-    myitoa(i, name + client->prefixLen_);
+    myitoa(i, name + client->prefixLen_, sizeof(name) - client->prefixLen_);
     client->path_.Push(name);
     //hexout(client->path_.actualPath_, client->path_.len_ + 3);
 
@@ -433,16 +431,16 @@ int StatDFSPaths(Client* client, AutoCleanupKfsClient* kfs) {
 
     for (int d = 0; d < client->levels_; d++) {
       int randIdx = rand() % client->inodesPerLevel_;
-      myitoa(randIdx, name + client->prefixLen_);
+      myitoa(randIdx, name + client->prefixLen_, sizeof(name) - client->prefixLen_);
       client->path_.Push(name);
       //fprintf(logFile, "Stat: path now is %s\n", client->path_.actualPath_);
     }
     //fprintf(logFile, "Stat: doing stat on [%s]\n", client->path_.actualPath_);
 
     KFS::KfsFileAttr attr;
-    int err = kfsClient->Stat(os.str().c_str(), attr);
+    int err = kfsClient->Stat(client->path_.String(), attr);
     if (err) {
-      fprintf(logFile, "error doing stat on %s\n", os.str().c_str());
+      fprintf(logFile, "error doing stat on %s\n", client->path_.String());
       return err;
     }
 
@@ -466,14 +464,15 @@ int ListDFSPaths(Client* client, AutoCleanupKfsClient* kfs) {
   gettimeofday(&tvAlpha, NULL);
   int inodeCount = 0;
 
-  queue<string> pending;
+  deque<string> pending;
   ostringstream os;
   os << TEST_BASE_DIR << "/" << client->hostName_ + "_" << client->processName_;
-  pending.push(os.str());
+  pending.push_back(os.str());
 
   while (!pending.empty()) {
-    string parent = pending.front();
-    pending.pop();
+    string parent;
+    parent.swap(pending.front());
+    pending.pop_front();
     //fprintf(logFile, "readdir on parent [%s]\n", parent.c_str());
     vector<KFS::KfsFileAttr> children;
     int err = kfsClient->ReaddirPlus(parent.c_str(), children);
@@ -482,20 +481,19 @@ int ListDFSPaths(Client* client, AutoCleanupKfsClient* kfs) {
       return err;
     }
     while (!children.empty()) {
-      string child = children.back().filename;
-      bool isDir = children.back().isDirectory;
-      children.pop_back();
+      const KFS::KfsFileAttr& childAttr = children.back();
+      const string& child = childAttr.filename;
+      bool isDir = childAttr.isDirectory;
       //fprintf(logFile, "  Child = %s inodeCount=%d\n", child.c_str(), inodeCount);
-      if (child == "." ||
-          child == "..") {
-        continue;
-      }
-      inodeCount ++;
-      if (isDir) {
-        string nextParent = parent + "/" + child;
-        pending.push(nextParent);
-        //fprintf(logFile, "  Adding next parent [%s]\n", nextParent.c_str());
+      if (child != "." && child != "..") {
+        inodeCount ++;
+        if (isDir) {
+          string nextParent = parent + "/" + child;
+          pending.push_back(nextParent);
+          //fprintf(logFile, "  Adding next parent [%s]\n", nextParent.c_str());
+        }
       }
+      children.pop_back();
       if (inodeCount > 0 && inodeCount % COUNT_INCR == 0) {
         fprintf(logFile, "Readdir paths so far: %d\n", inodeCount);
       }
@@ -546,7 +544,7 @@ int RemoveDFSPaths(Client* client, AutoCleanupKfsClient* kfs) {
     while (lev < client->levels_) {
       pos = idx / client->inodesPerLevel_;
       delta = idx - (pos * client->inodesPerLevel_);
-      myitoa(delta, sfx);
+      myitoa(delta, sfx, sizeof(sfx));
       if (pathSoFar.length()) {
         pathSoFar = client->prefix_ + sfx + "/" + pathSoFar;
       } else {
diff --git a/benchmarks/mstress/mstress_plan.py b/benchmarks/mstress/mstress_plan.py
index 6388e531c..4cfcbc855 100755
--- a/benchmarks/mstress/mstress_plan.py
+++ b/benchmarks/mstress/mstress_plan.py
@@ -183,6 +183,9 @@ def main():
     print("==> Created planfile: %s" % opts.output_file)
     print("copying file %s to all client hosts" % opts.output_file)
     for client in hostlist:
+        if client in ("localhost", "127.0.0.1"):
+            print("available %s on %s" % (opts.output_file, client))
+            continue
         p = subprocess.Popen(
             [
                 "/usr/bin/scp",
@@ -195,7 +198,9 @@ def main():
             if ret is None:
                 time.sleep(0.5)
             else:
-                print("transfered %s to %s" % (opts.output_file, client))
+                if ret != 0:
+                    sys.exit("failed to transfer %s to %s" % (opts.output_file, client))
+                print("transferred %s to %s" % (opts.output_file, client))
                 break
 
 
diff --git a/benchmarks/mstress/mstress_run.py b/benchmarks/mstress/mstress_run.py
index 05dde0899..ef564d3e8 100755
--- a/benchmarks/mstress/mstress_run.py
+++ b/benchmarks/mstress/mstress_run.py
@@ -45,7 +45,7 @@ def NumFiles2Stat():
             Params.INODES_PER_LEVEL**Params.PATH_LEVELS
             * Params.CLIENTS_PER_HOST
             * len(Params.CLIENT_HOSTS.split(","))
-            / 2
+            // 2
         )
 
     NumFiles2Stat = staticmethod(NumFiles2Stat)
@@ -136,33 +136,30 @@ def Execute(type, args):
         % type
     )
 
-    result = ""
+    result = []
     proc = subprocess.Popen(
-        args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
+        args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+        universal_newlines=True
     )
-    while proc.poll() is None:
-        output = proc.stdout.read(1)
-        result += output
+    for output in iter(proc.stdout.readline, ""):
+        result.append(output)
         sys.stdout.write(output)
         sys.stdout.flush()
-
-    output = proc.stdout.read()
-    result += output
-    sys.stdout.write(output)
-    sys.stdout.flush()
     proc.wait()
 
-    return result
+    return "".join(result)
 
 
 def PrintResult(type, result):
     PrintMsg("\nBenchmark results for '%s':" % type)
     for m in re.findall(r"(\w+) test took (\S+) sec", result):
         PrintMsg("%-10s: %s sec" % (m[0], m[1]))
-    PrintMsg(
-        "\n%s\n=========================================="
-        % re.search(r"Memory usage .*$", result, re.MULTILINE).group(0)
-    )
+    memory = re.search(r"Memory usage .*", result, re.MULTILINE)
+    if memory:
+        PrintMsg(
+            "\n%s\n=========================================="
+            % memory.group(0)
+        )
 
 
 def ParseArgs():

From da95513e4289f5a52c0b2f3029642b55e0851927 Mon Sep 17 00:00:00 2001
From: zhangzhibiao <zhangzhibiao@bigo.sg>
Date: Mon, 25 May 2026 17:07:56 +0800
Subject: [PATCH 3/7] opt lock

---
 MetaTree-Lock-Optimization.md     | 227 ++++++++++++++++++++++++++++++
 scripts/run_mstress_create.sh     |  39 +++++
 scripts/start_clean_metaserver.sh |  86 +++++++++++
 3 files changed, 352 insertions(+)
 create mode 100644 MetaTree-Lock-Optimization.md
 create mode 100755 scripts/run_mstress_create.sh
 create mode 100755 scripts/start_clean_metaserver.sh

diff --git a/MetaTree-Lock-Optimization.md b/MetaTree-Lock-Optimization.md
new file mode 100644
index 000000000..4330d1eb5
--- /dev/null
+++ b/MetaTree-Lock-Optimization.md
@@ -0,0 +1,227 @@
+# MetaTree Lock Optimization Plan
+
+## Background
+
+The create-file benchmark scales poorly as `metaServer.clientThreadCount` grows.
+In the measured runs, the `INFO` configuration with 4 client threads was faster
+than the `INFO` configuration with 20 client threads, which points to lock
+contention inside the meta server rather than chunkserver or data path bottlenecks.
+
+For empty-file create, the hot path is:
+
+1. `ClientThread::DispatchStart()`
+2. `submit_request()`
+3. `MetaRequest::SubmitBegin()`
+4. `LogWriter::Enqueue()`
+5. `MetaCreate::start()`
+6. `MetaCreate::handle()`
+7. `Tree::create()`
+8. `LogWriter::ScheduleFlush()`
+
+`Tree::create()` mutates the global metadata tree by doing lookup, optional
+remove, fid allocation, dentry/fattr insertion, and count updates.
+
+## Current Locking Problem
+
+Before this optimization, client threads held the global net dispatch mutex while
+processing pending metadata requests:
+
+```text
+dispatch mutex
+  submit_request()
+    MetaRequest::Submit()
+      LogWriter::Enqueue()
+      MetaCreate::handle()
+        metatree.create()
+  LogWriter::ScheduleFlush()
+```
+
+This makes `metaServer.clientThreadCount` scale poorly. More client threads
+mainly increase contention on the same dispatch mutex.
+
+## Important Constraint
+
+The metadata tree is not currently safe for simple per-parent-directory locking.
+It is implemented as a single global B-tree. Even creates in different parent
+directories can modify shared B-tree nodes, split internal nodes, update the
+root, or touch shared indexes.
+
+Therefore, this is unsafe as a direct first step:
+
+```text
+lock(parent_dir)
+  Tree::create(parent_dir, name)
+```
+
+That would protect directory-level semantics but not the global B-tree data
+structure.
+
+## Implemented First Step
+
+The first step separates the broad dispatch lock from metadata mutation and log
+writer state.
+
+### 1. Add a dedicated metadata request mutex
+
+`submit_request()` now takes a dedicated metadata processing mutex before calling
+`MetaRequest::Submit()`.
+
+This preserves existing metatree safety while removing metadata processing from
+the net dispatch mutex.
+
+```text
+meta request mutex
+  MetaRequest::Submit()
+    MetaCreate::handle()
+      metatree.create()
+```
+
+### 2. Shrink dispatch mutex scope
+
+`ClientThread::DispatchStart()` now keeps the dispatch mutex only around fork
+coordination and auth context update. It does not hold the dispatch mutex while
+processing the request batch.
+
+This changes the lock shape to:
+
+```text
+dispatch mutex
+  PrepareToFork()
+  auth context update
+  ForkDone()
+
+meta request mutex
+  submit_request()
+```
+
+### 3. Protect LogWriter state with LogWriter mutex
+
+Moving request processing out of the dispatch mutex means LogWriter can no
+longer rely on dispatch serialization. The following paths now explicitly use
+`LogWriter::mMutex`:
+
+```text
+LogWriter::Enqueue()
+LogWriter::RequestCommitted()
+LogWriter::ScheduleFlush()
+```
+
+This protects pending queues, commit state, and flush scheduling when multiple
+client threads reach the log writer concurrently.
+
+## What This Does Not Yet Solve
+
+This first step does not make `Tree::create()` itself parallel across parent
+directories. It deliberately keeps metadata mutation serialized through the
+metadata request mutex.
+
+The goal is to remove one oversized outer lock and introduce clearer lock
+ownership:
+
+```text
+dispatch state -> dispatch mutex
+metadata mutation -> metadata request mutex
+log writer state -> log writer mutex
+```
+
+This is a safe prerequisite for deeper metatree concurrency work.
+
+## Next Steps Toward True MetaTree Concurrency
+
+### Step 1: Add profiling around the new lock boundaries
+
+Measure:
+
+```text
+dispatch mutex wait / hold time
+metadata request mutex wait / hold time
+LogWriter mutex wait / hold time
+Tree::create() latency
+Tree::lookup() latency
+Tree::link() latency
+log flush batch size and latency
+```
+
+This confirms whether contention moved from dispatch mutex to metadata request
+mutex or LogWriter.
+
+### Step 2: Split read-only and mutation requests
+
+Introduce a metadata operation classification:
+
+```text
+read-only ops
+mutation ops
+log-dependent mutation ops
+```
+
+Read-only requests can eventually run under a shared/read lock, while mutation
+requests keep exclusive protection.
+
+### Step 3: Refactor metatree storage for sharding
+
+True per-directory create parallelism needs the data structure to stop using one
+global mutable B-tree for all dentries/fattrs.
+
+Candidate direction:
+
+```text
+fid/fattr index: separately protected or sharded by fid
+dentry index: sharded by parent fid
+path cache: separately protected or disabled on mutation-heavy workloads
+directory counters: parent-chain locking with stable lock ordering
+```
+
+Only after this split is it safe to use parent-directory locks for create.
+
+### Step 4: Add parent-directory locking
+
+Once dentry storage is sharded by parent fid:
+
+```text
+lock(parent_dir)
+  check permissions
+  lookup child name
+  allocate fid
+  insert dentry in parent shard
+  insert fattr in fid shard
+  update parent counters
+```
+
+Lock ordering must be explicit. Rename is the main hard case because it touches
+two parent directories and may update path cache and subtree invariants.
+
+### Step 5: Validate replay and transaction ordering
+
+Create is a logged operation. Any concurrency change must preserve:
+
+```text
+log sequence order
+fid seed replay correctness
+idempotent request behavior
+rename/create/remove ordering
+checkpoint consistency
+```
+
+Log ordering can remain serialized even if independent metatree mutations become
+parallel internally.
+
+## Verification
+
+Current first-step build verification:
+
+```bash
+cmake --build bld --target metaserver -j4
+```
+
+The target builds successfully.
+
+## Risk Notes
+
+The safe first step may improve throughput if the dispatch mutex was the main
+contention point. If the bottleneck is now the metadata request mutex or log
+writer, throughput may not improve significantly.
+
+Do not replace the metadata request mutex with a parent-directory lock until the
+global B-tree has been refactored or otherwise proven safe for concurrent
+mutation.
diff --git a/scripts/run_mstress_create.sh b/scripts/run_mstress_create.sh
new file mode 100755
index 000000000..a7d7fcf81
--- /dev/null
+++ b/scripts/run_mstress_create.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# Run the mstress "create" benchmark against an already-running local metaserver.
+# Usage: run_mstress_create.sh [plan-file]; QFS_ROOT overrides the checkout directory.
+set -euo pipefail
+cd "${QFS_ROOT:-/work/bigo-qfs}"  # every path below is relative to the checkout
+PLAN_FILE="${1:-output/mstress_1m_after_lock.plan}"
+QFS="bld/output/bin/tools/qfs"
+QFS_CFG="qfsbase/client/clidefault.prp"
+MSTRESS="bld/benchmarks/mstress/mstress.py"
+META_HOST="localhost"
+META_PORT="20000"
+
+if [ ! -f "${PLAN_FILE}" ]; then
+  echo "Plan file not found: ${PLAN_FILE}" >&2
+  exit 1
+fi
+
+echo "Checking metaserver..."
+bld/output/bin/tools/qfsping -m -s "${META_HOST}" -p "${META_PORT}"
+
+echo "Cleaning /mstress..."
+"${QFS}" \
+  -D dfs.force.remove=true \
+  -cfg "${QFS_CFG}" \
+  -rmr /mstress >/dev/null 2>&1 || true
+
+echo "Running create benchmark with plan: ${PLAN_FILE}"
+python "${MSTRESS}" \
+  -m slave \
+  -f qfs \
+  -s "${META_HOST}" \
+  -p "${META_PORT}" \
+  -t create \
+  -a "${PLAN_FILE}" \
+  -c localhost \
+  -k localhost
+
+echo "Summary:"
+grep -En "paths created|failed|ERROR|FATAL" "${PLAN_FILE}"* 2>/dev/null || true  # best-effort; plain grep avoids a ripgrep dependency
diff --git a/scripts/start_clean_metaserver.sh b/scripts/start_clean_metaserver.sh
new file mode 100755
index 000000000..cbfcd1db6
--- /dev/null
+++ b/scripts/start_clean_metaserver.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# Stop the local QFS metaserver/chunkservers, archive their state, then start a fresh filesystem.
+# QFS_ROOT overrides the checkout directory (default: /work/bigo-qfs).
+set -euo pipefail
+cd "${QFS_ROOT:-/work/bigo-qfs}"
+META_BIN="bld/output/bin/metaserver"
+META_CONF="qfsbase/meta/conf/MetaServer.prp"
+META_LOG="qfsbase/meta/MetaServer.log"
+META_OUT="qfsbase/meta/MetaServer.out"
+META_BASE="qfsbase/meta"
+TS="$(date +%Y%m%d_%H%M%S)"
+
+# Signal the live process recorded in pid file $1, then delete the file (no-op if absent).
+stop_pid_file() {
+  local pid_file="$1" pid
+  [ -f "${pid_file}" ] || return 0
+  pid="$(cat "${pid_file}" || true)"
+  # An empty or stale pid is skipped; a failed kill is tolerated.
+  if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
+    kill "${pid}" || true
+  fi
+  rm -f "${pid_file}"
+}
+
+echo "Stopping existing metaserver, if any..."
+stop_pid_file "${META_BASE}/metaserver.pid"
+pkill -f "${META_BIN} ${META_CONF}" 2>/dev/null || true  # also signal instances that left no pid file
+
+echo "Stopping existing chunkservers, if any..."
+for idx in 1 2 3; do
+  stop_pid_file "qfsbase/chunk${idx}/chunkserver.pid"
+done
+pkill -f "bld/output/bin/chunkserver qfsbase/chunk" 2>/dev/null || true
+sleep 1  # presumably gives the signalled processes time to exit
+
+echo "Archiving old meta logs/checkpoints..."
+mkdir -p "${META_BASE}"
+if [ -d "${META_BASE}/logs" ]; then
+  mv "${META_BASE}/logs" "${META_BASE}/logs.bak.${TS}"
+fi
+if [ -d "${META_BASE}/checkpoints" ]; then
+  mv "${META_BASE}/checkpoints" "${META_BASE}/checkpoints.bak.${TS}"
+fi
+mkdir -p "${META_BASE}/logs" "${META_BASE}/checkpoints"
+
+echo "Archiving old chunkserver local state..."
+for idx in 1 2 3; do
+  CHUNK_BASE="qfsbase/chunk${idx}"
+  for path in "${CHUNK_BASE}"/chunkdir*; do
+    if [ -d "${path}" ] && [[ "${path##*/}" != *.bak.* ]]; then  # the glob also matches archives left by earlier runs; skip them
+      mv "${path}" "${path}.bak.${TS}"
+      mkdir -p "${path}"
+    fi
+  done
+done
+
+echo "Creating clean filesystem..."
+"${META_BIN}" \
+  -c \
+  "${META_CONF}" \
+  "${META_LOG}" \
+  > "${META_OUT}" 2>&1
+
+echo "Starting metaserver..."
+setsid -f "${META_BIN}" \
+  "${META_CONF}" \
+  "${META_LOG}" \
+  >> "${META_OUT}" 2>&1
+
+sleep 2  # presumably lets the metaserver come up before chunkservers connect
+
+echo "Starting chunkservers..."
+for idx in 1 2 3; do
+  setsid -f bld/output/bin/chunkserver \
+    "qfsbase/chunk${idx}/conf/ChunkServer.prp" \
+    "qfsbase/chunk${idx}/ChunkServer.log" \
+    > "qfsbase/chunk${idx}/ChunkServer.out" 2>&1
+done
+
+sleep 5
+
+echo "Process:"
+ps -ef | awk "/bld\/output\/bin\/metaserver|bld\/output\/bin\/chunkserver/ && !/awk/ {print}"
+
+echo "Ping:"
+bld/output/bin/tools/qfsping -m -s localhost -p 20000

From d11e34d9fdf287a3067d9e5fede7baba3f369b14 Mon Sep 17 00:00:00 2001
From: zhangzhibiao <zhangzhibiao@bigo.sg>
Date: Mon, 25 May 2026 17:58:17 +0800
Subject: [PATCH 4/7] add rfc

---
 .../RFC-0001-memory-native-metadata-layer.md  | 908 ++++++++++++++++++
 1 file changed, 908 insertions(+)
 create mode 100644 docs/rfc/RFC-0001-memory-native-metadata-layer.md

diff --git a/docs/rfc/RFC-0001-memory-native-metadata-layer.md b/docs/rfc/RFC-0001-memory-native-metadata-layer.md
new file mode 100644
index 000000000..6074bfe2c
--- /dev/null
+++ b/docs/rfc/RFC-0001-memory-native-metadata-layer.md
@@ -0,0 +1,908 @@
+# RFC-0001: 内存原生元数据层（Memory-Native Metadata Layer）
+
+| 字段 | 值 |
+|------|-----|
+| **状态** | Draft |
+| **日期** | 2026-05-25 |
+| **相关** | QFS `metatree`（B+ 树）、`LogWriter`、HDFS NameNode edit log + FSImage |
+| **动机来源** | CREATE 延迟分析、与 HDFS NN 路径对比、绿场元数据设计讨论 |
+
+---
+
+## 摘要
+
+本 RFC 提议为 QFS 类分布式文件系统定义一套**从 0 设计的内存原生元数据层**：命名空间用 **按目录分片的哈希索引 + 全局 inode 表** 维护，持久化采用 **edit log + HDFS 式用户快照（引用/COW）+ 周期性 Checkpoint（FSImage）**，与当前 **单一 B+ 树（`metatree`）+ 先 WAL 后改树** 的实现路线对比，并给出实现与分阶段交付路径（**不含**从现有 B+ 树/checkpoint 的迁移方案）。
+
+目标是在可比持久化语义下，将 **`create` / `lookup` 的热路径** 从「多次 B+ 树 descent + 双 insert + 全局串行」收敛为「O(1) 内存索引 + 摊销组提交 fsync」，并保留 VR/幂等等生产特性。
+
+---
+
+## 1. 背景与动机
+
+### 1.1 当前 QFS 元数据路径（CREATE）
+
+空文件 `CREATE` 的典型路径：
+
+```text
+ClientThread → submit_request() [全局互斥]
+  → MetaCreate::start()           [校验，不改树]
+  → LogWriter::Enqueue()          [写 transaction log]
+  → （log committed 后）MetaCreate::handle()
+  → Tree::create()
+       → getFattr(parent)          [B+ 树查找]
+       → lookup(parent, name)      [B+ 树查找]
+       → link()
+            → insert(MetaDentry)   [B+ 树插入 #1]
+            → insert(MetaFattr)     [B+ 树插入 #2]
+```
+
+特征：
+
+- 命名空间、dentry、fattr、chunkinfo 混在同一棵 **B+ 树**（`kfstree.h` 明确为 B+ 树）。
+- **先 durable log，再** 修改内存树（`SubmitBegin` + `LogWriter` 队列）。
+- `MetaIdempotentRequest` 将 `logAction` 设为 **`kLogAlways`**，成功路径几乎必落 log。
+- 核心处理在 **`submit_request` 全局锁** 下串行（见 `NetDispatch.cc` 注释）。
+
+### 1.2 HDFS NameNode 对照
+
+HDFS 将问题拆成两层：
+
+| 层 | 实现要点 |
+|----|----------|
+| 运行时 | 全内存 inode 树；目录下按名索引（hash/map）；`create` 主要为内存挂接 |
+| 持久化 | Edit log（操作记录）+ FSImage/checkpoint；fsync 可组提交 |
+
+Chunk 分配通常在 **首次 write / addBlock**，而非 `create` 本身，因此 NN 上 `create` 常数项小。
+
+### 1.3 结论
+
+QFS **并非不能** 采用 HDFS NN 式布局；当前选择是 **统一 B+ 树 + 树形 checkpoint** 的历史工程路线。在 CREATE 延迟与元数据 QPS 成为瓶颈时，绿场或下一代元数据层值得单独设计，而非仅在 B+ 树上做局部锁优化（参见仓库内 `MetaTree-Lock-Optimization.md`）。
+
+---
+
+## 2. 目标与非目标
+
+### 2.1 目标
+
+1. **`create` / `lookup`（按 fid + name）**：小目录 **O(1) 均摊**（Small）；超大目录 **O(log N)**（Large，§4.2）；均无全局 `metatree` descent。
+2. **持久化**：edit log + §6.3 用户快照 + §6.4 Checkpoint；支持 **`sync=none | batch | always`** 三档。
+3. **吞吐**：通过 **目录分片锁 + log 单写线程组提交** 提升并行度。
+4. **语义**：保留客户端 **父目录 fid**、幂等 `(session_id, op_id)`、VR/quorum 复制（与现有 MetaServer 部署模型兼容）。
+5. **规模（单机）**：假定 **单个 MetaServer 进程、单机 RAM 容纳全部 namespace**（`DirIndex` + `InodeTable` + 可选 `BlockMap`）；容量规划为运维/部署话题，本 RFC 不定义上限模型。
+6. **大目录（首版必做）**：单目录 **百万级** 子项时，`lookup` / `create` / `readdir` 不得退化为「单哈希桶长链遍历」；须使用 §4.2 的 **Small/Large 双布局** 与 **升格（promotion）** 机制。
+
+### 2.2 非目标（本 RFC 首版）
+
+- 不定义 chunk 数据路径、纠删码、LayoutManager 细节（仅要求 **BlockMap 与 namespace 解耦**）。
+- 不替换现有 ChunkServer 协议。
+- **不考虑**从现有 B+ 树 / checkpoint / transaction log 的**离线迁移、在线双写、回滚**（若需要，另起 RFC）。
+- **不考虑**元数据 **水平分片**（多 MetaServer 各管一段 namespace）、**冷 inode / namespace 换出**、单机内存超限后的分级存储（另起 RFC）。
+- 不实现完整 POSIX（符号链接、硬链接语义等可后续 RFC 补充）。
+
+本 RFC 假定 **绿场部署**：新集群以 v2 snapshot + edit log **冷启动**，或与现行 `metatree` 并行存在、不互通。
+
+文中 **「分片」** 若无特别说明，均指 **单进程内** 的锁分片 / `hash(fid) % N` 数据结构分片，**不是** 集群级 namespace 分片。
+
+---
+
+## 3. 提议架构
+
+### 3.1 逻辑分层
+
+```text
+┌─────────────────────────────────────────────────────────┐
+│  RPC 层：CREATE / LOOKUP / READDIR / REMOVE / RENAME …   │
+└───────────────────────────┬─────────────────────────────┘
+                            │
+┌───────────────────────────▼─────────────────────────────┐
+│  内存权威层（Authoritative In-Memory State）               │
+│  • Namespace：DirTable[parent_fid] → DirNode（Small/Large）│
+│  • InodeTable[fid] → Inode（属性、parent、类型、计数）      │
+│  • BlockMap[fid] → chunk 列表（可选独立模块/服务）          │
+│  • （无服务端 PathCache；路径缓存仅在客户端，见 §4.3）       │
+└───────────────────────────┬─────────────────────────────┘
+                            │ 仅追加操作记录
+┌───────────────────────────▼─────────────────────────────┐
+│  持久化层                                                 │
+│  • Edit Log（二进制 op，组提交 fsync）                     │
+│  • 用户快照：HDFS 式引用 + 文件级 COW（§6.3）               │
+│  • Checkpoint/FSImage：一致性点 N + 后台遍历（§6.4）          │
+│  • Quorum / VR 复制（复用现有 LogWriter/VR 基础设施）       │
+└─────────────────────────────────────────────────────────┘
+```
+
+### 3.2 与 QFS 现状的核心差异
+
+| 维度 | QFS 现状 | 本 RFC |
+|------|----------|--------|
+| 主索引 | 全局 B+ 树，dentry/fattr 不同 key | DirNode（Small hash / Large **复用 `kfstree` 每目录一棵 `Tree`**）+ InodeTable |
+| CREATE 索引操作 | 2× 全局 `insert` + 多次 `findLeaf` | 1× DirNode insert + 1× InodeTable insert |
+| 百万级单目录 | 同全局树叶子链/同桶冲突风险 | Large 布局 O(log N)，首版必做 promotion |
+| 持久化顺序 | 先 WAL committed，再 `handle()` | 临界区内改内存 + append log buffer；fsync 摊销 |
+| 用户快照 | （现 QFS 无同等机制） | `InodeRef` + 文件级 COW，创建 O(1)（§6.3） |
+| checkpoint | B+ 树页/节点序列化 | 模糊 FSImage（§6.4）+ replay txn>N |
+| chunk 元数据 | 同树 `KFS_CHUNKINFO` | BlockMap 分离，allocate 时再写 |
+
+---
+
+## 4. 内存数据结构
+
+### 4.1 Inode
+
+```text
+Inode {
+  fid:          u64          // 全局唯一，单调分配（单机）
+  type:         file | dir | symlink
+  parent_fid:   u64          // 父目录；根目录哨兵
+  mode, uid, gid, size, mtime, ctime, atime
+  nlink, flags               // 见 §8.4：WORM、dumpster 子树、striping 等
+  snapshottable: bool        // 目录可打快照（§6.3）
+  snap_ref_count: u32        // 被用户快照持有的 frozen 引用数（§6.3.6）；live inode 常为 0
+  replication | ec_policy    // 或仅指针，详细布局在 allocate 时设置
+  dir_child_count            // 仅目录；用于 readdir 分页提示
+  generation:   u64          // 每次 rename/unlink/rmdir/promotion 递增，供 cache 失效
+}
+```
+
+存储：`InodeTable` 为 `fid → Inode`，数组分片或 `flat_hash_map` 分片。
+
+### 4.2 目录索引 DirIndex（含大目录，首版必做）
+
+**问题**：若每个目录仅用一个全局 `HashMap` 且冲突用链表串接，则单目录 **百万文件** 时会出现极端哈希桶长链，`lookup` / `create` / `readdir` 退化为 **O(N)**，成为新瓶颈。
+
+**决策**：每个目录一个 **`DirNode`**，首版即实现 **Small（哈希）+ Large（有序索引）** 两种布局，并在子项数超过阈值时 **强制升格**（promotion），不是后续可选优化。
+
+#### 4.2.1 公共类型
+
+```text
+NameKey  = (name_hash: u64, name: string)   // name_hash = Hsieh(name)<<4，与现 MetaDentry 一致
+DirEntry = { child_fid, name }
+DirNode  = {
+  state:       SMALL | PROMOTING | LARGE   // 见 §4.2.6
+  generation:  u64                        // promotion 完成后递增，失效 readdir cookie
+  child_count: u64
+  body:        SmallDir | LargeDir        // PROMOTING 期间读者只访问 Small
+  staging:     LargeDir?                  // 仅 PROMOTING：构建完成前对读者不可见
+}
+```
+
+全局：`DirTable[parent_fid] → DirNode`（按 `parent_fid` 分片锁，§7；晋升见 §4.2.6）。
+
+#### 4.2.2 Small 布局（子项数 < `dir_large_threshold`）
+
+- 结构：**开放寻址** `flat_hash_map<NameKey, fid>`（Robin Hood 或等价），**禁止**无限链表冲突链。
+- **`lookup` / `create`**：均摊 **O(1)**；探测次数有硬上限 `max_probe`（如 16），插入时若接近满负荷或探测失败则 **触发升格** 而非继续堆链。
+- 默认阈值 **`dir_large_threshold = 4096`**（可配置 `meta.dir.largeThreshold`）。
+
+#### 4.2.3 Large 布局（子项数 ≥ 阈值，或 Small 无法安全插入）
+
+**决策：Large 布局直接复用当前 QFS B+ 树实现**（`kfstree.h` / `kfstree.cc`），不新写一套目录 B-tree。与全局 `metatree` 的差异仅是 **每目录一棵独立 `Tree` 实例**，键空间 scoped 在该 `parent_fid` 下。
+
+| 复用组件 | 路径 / 说明 |
+|----------|-------------|
+| 内部节点 | `Node`（`NKEY=170`，4096B 页式节点，`findplace` 二分，`split` / `merge`） |
+| 树操作 | `Tree::insert`、`Tree::del`、`lowerBound` / `findLeaf`、`LeafIter` |
+| 键 | 现有 `Key` / `PartialMatch`；叶键 **`Key(KFS_DENTRY, parent_fid, name_hash)`**，与现 `MetaDentry::keySelf()` 一致 |
+| 叶记录 | `MetaDentry`（或薄封装 `DirBTreeLeaf` 内嵌相同字段）；`matchSelf` 比对 `name` |
+| 内存 | `MetaNode::allocate` / `PoolAllocator`（与现 meta 节点相同） |
+
+```text
+LargeDir {
+  parent_fid:  fid_t
+  tree:        Tree          // 现 kfstree.Tree，非全局 metatree 单例
+}
+```
+
+- **语义**：逻辑上仍是「该目录下 name → child_fid」；物理上用 **一棵子树** 存该目录全部 `MetaDentry` 叶，**不再**插入全局 `metatree` 的混合 key 空间。
+- **`lookup` / `create`**：对该目录的 `Tree` 调用与现 `getDentry` / `insert` 相同逻辑（`findLeaf` + 叶链 `peer()` 扫同名 hash），**O(log N)**，无百万长链。
+- **`readdir`**：`LeafIter` 逻辑序遍历 + §5.4 **逻辑位置 cookie**（禁止裸指针）。
+- **升格（promotion）**：原子性与并发语义见 **§4.2.6**（`PROMOTING` 状态、写阻塞、读仍用 Small、staging 完成后一次性切换）。
+- **checkpoint / fsck**：Large 目录序列化可 **复用现 Node/Meta checkpoint 格式**；fsck 见 §8.5（`PROMOTING` 视为 transient，持久化的 Checkpoint/FSImage 中不应出现）。
+
+**不新写**：单独的目录 B-tree 节点类型、另一套 split/merge 或不同于 `Node` 页大小的树实现。
+
+#### 4.2.4 复杂度与验收（百万级单目录）
+
+| 操作 | Small | Large |
+|------|-------|-------|
+| lookup | O(1) 均摊，探测有界 | O(log N) |
+| create | O(1) 均摊或触发 O(N) 一次性 promotion | O(log N) |
+| readdir 一页 | O(page) 或扫描有界桶 | O(log N + page) |
+
+**禁止**：单桶链表长度 ∝ N、百万次指针追逐的「伪 O(1) 哈希表」。
+
+#### 4.2.5 与全局 `metatree` 的关系
+
+| | 全局 `metatree`（现 QFS） | Large `DirNode`（本 RFC） |
+|--|---------------------------|---------------------------|
+| 代码 | `kfstree` | **同一套** `kfstree` |
+| 实例 | 单例 `metatree`，混放 dentry/fattr/chunk | **每超大目录一个 `Tree`** |
+| create 副作用 | 可能 split 共享祖先内部节点 | 仅影响该目录子树 |
+| 小目录 | 也走全局树 | **Small `flat_hash`**，不进 B+ 树 |
+
+InodeTable、BlockMap **不再**进入任何 B+ 树；仅 **超大目录的子项列表** 使用 `Tree` 存 `MetaDentry` 叶。
+
+#### 4.2.6 晋升（Promotion）的原子性与可见性（已决）
+
+**问题**：§4.2.3 若在「半建成」的 Large `Tree` 上并发 `lookup`/`create`，可能看到 **不完整** 的 B+ 树或 Small/Large 双写混乱。
+
+**决策**：`DirNode` 增加 **`state`**；晋升在 **staging** 中构建 Large，通过 **一次性发布** 切换；晋升期间 **读走 Small、写阻塞或排队**。
+
+##### 状态机
+
+```text
+SMALL ──(触发晋升)──► PROMOTING ──(发布完成)──► LARGE
+                         │
+                         └── 失败回滚 ──► SMALL（见下）
+```
+
+| `state` | 读者 (`lookup`/`readdir`) | 写者 (`create`/`unlink`/`rename` 子项) |
+|---------|---------------------------|----------------------------------------|
+| **SMALL** | `body.small` | 正常；可能触发进入 PROMOTING |
+| **PROMOTING** | **仅** `body.small`（不读 `staging`） | **阻塞**于 `promote_cv` 或同目录写队列，直到 `LARGE` |
+| **LARGE** | `body.large.tree` | 正常 `kfstree` 路径 |
+
+##### 晋升算法（持有 `DirTable[parent]` 互斥或写锁）
+
+```text
+promote_small_to_large(parent_fid):
+  lock(dir)   // 目录分片写锁；阻塞其它写者，读者见下
+
+  1. assert(state == SMALL)
+  2. state = PROMOTING
+  3. staging.large = new Tree()          // 读者不可见
+  4. for entry in body.small:            // 只读 Small，不改 Small
+       staging.large.insert(MetaDentry(...))
+  5. // 一次性发布（原子切换可见布局）
+     body.large   = move(staging.large)
+     staging      = null
+     state        = LARGE
+     free(body.small)
+     generation++                         // 失效 readdir cookie / 客户端 path 缓存
+  6. broadcast(promote_cv)                // 唤醒排队写者
+  7. append EditLog(DIR_PROMOTE, parent_fid, generation)
+  unlock(dir)
+```
+
+- **「原子」含义**：在步骤 5 之前，任何 RPC **不可能** 观察到 `staging` 或半填充的 `body.large`；步骤 5 之后，**不可能** 再观察到 `body.small`。
+- 实现上可用 **同一把目录锁** 包裹步骤 2–6；步骤 5 的字段赋值顺序：`body.large` 生效 → `staging` 清空 → `state=LARGE` → 释放 `small`（避免读者看到 `LARGE` 但 body 仍为空，或在 `state` 切换前读到已释放的 `small`）。
+
+##### 并发 `lookup` / `create`（与 §7 分片锁配合）
+
+| 操作 | `state == PROMOTING` 时行为 |
+|------|---------------------------|
+| **lookup** | 获取目录 **读锁**（或与写互斥的 `shared_lock`）：读 **`body.small` 快照**，与晋升线程不共享写；晋升 **不修改** Small，只读遍历。 |
+| **readdir** | 同 lookup；cookie 若带旧 `generation`，晋升完成后返回 **失效**，客户端重试。 |
+| **create** | 需目录 **写锁**：若 `PROMOTING`，**等待** 晋升完成（`promote_cv`），不得在半成品 Large 上 insert。 |
+| **触发晋升的 create** | 当前线程持写锁执行 `promote_small_to_large`，完成后在同一锁内对 **Large** 执行 insert。 |
+
+**不采用**：晋升过程中对活动 RPC 暴露「部分迁移」的 Large；不采用无 `PROMOTING` 标记、原地边建树边切换 `layout` 字段。
+
+##### 失败与恢复
+
+- 若步骤 4 失败：`state` 回滚 **SMALL**，丢弃 `staging`，`generation` 不变，唤醒等待者并返回错误。
+- Edit log 仅在 **成功** 步骤 7 记录 `DIR_PROMOTE`；replay 时目录应已为 **LARGE**（或从 Checkpoint/FSImage（§6.4）还原 `state` 字段）。
+- §6.4 Checkpoint 扫描时：若发现 `PROMOTING`（崩溃中间态），按 **SMALL** 序列化并打标需 **重做 promotion** 或 fsck 修复（运维策略，首版可 panic 要求重放 log 修复）。
+
+##### 与 §5 热路径的衔接
+
+```text
+DirTable[parent].lookup(name):
+  lock_shared(dir)
+  switch (state):
+    SMALL | PROMOTING → return body.small.find(name)
+    LARGE             → return body.large.tree.lookup(...)
+
+DirTable[parent].insert(name, child_fid):
+  lock_exclusive(dir)
+  while (state == PROMOTING) wait(promote_cv)
+  if state == SMALL && need_promote(): promote_small_to_large()  // 仍持写锁
+  ... insert into active body ...
+```
+
+### 4.3 路径缓存（已决：仅客户端）
+
+**决策：不在 MetaServer 集群内维护、复制或共享 PathCache**（对比现 QFS 可选的 `metaServer.enablePathToFidCache`，本设计 **不** 在服务端做路径→fid 缓存）。
+
+| 侧 | 职责 |
+|----|------|
+| **客户端** | 维护 `path → fid`、`parent_fid` 等缓存；热路径用 **fid + name** 发 RPC，避免 `LOOKUP_PATH`。 |
+| **服务端** | 不存 PathCache；`lookup` / `LOOKUP_PATH` 每次按 `DirIndex` 解析。通过 RPC 响应携带 **`generation`**（目录或 inode 上的单调版本），供客户端判断缓存是否失效。 |
+
+**目录 `generation`**（§4.1 Inode）：在 `rename` / `rmdir` / `unlink` / 子树变更时递增；客户端比对 `(path, cached_fid, cached_generation)`，不一致则丢弃该路径缓存项并重新解析。
+
+**失效规则（客户端本地）**：
+
+- 单文件 `remove` / `rename`：失效该路径及已知子路径前缀（若有目录缓存树）。
+- 目录 `rmdir`：失效以该路径为前缀的全部缓存项。
+- 收到别客户端 mutating 成功且本地无 generation 时：可保守失效父目录缓存，或依赖后续 `LOOKUP` 失败再刷新。
+
+**不采用**：MetaServer 间复制 path cache、standby 只读副本提供缓存命中、或全局 `PathToFidCacheMap`（避免一致性、失效广播与内存占用问题）。
+
+### 4.4 BlockMap（与 namespace 分离）
+
+```text
+BlockMap : 按 fid 分片
+  fid → [ ChunkInfo { chunk_id, offset, version, locations, tier } ]
+```
+
+- **`create` 不写入 BlockMap**（与 HDFS 一致）。
+- **`allocate` / `append`** 才追加 chunk 记录；edit log 使用独立 op 类型。
+
+---
+
+## 5. 热路径算法
+
+### 5.1 CREATE（空文件）
+
+**前置**：客户端提供 `parent_fid` + `name`（已有 QFS `MetaCreate::dir`）。
+
+```text
+1. shard = hash(parent_fid) % N_SHARDS
+2. lock(DirShard[shard])
+3.   if DirTable[parent].contains(name) → 处理 exclusive / truncate 语义
+4.   new_fid = FidAllocator.next()
+5.   DirTable[parent].insert(name, new_fid)   // §4.2，必要时 promotion
+6.   InodeTable[new_fid] = Inode{ parent, attrs... }
+7.   update parent.mtime, parent.dir_child_count
+8.   txn = EditLog.append(CREATE, parent, name, new_fid, attrs, op_id)
+9. unlock
+10. if sync_policy == always: wait(txn.committed)
+11. return new_fid
+```
+
+**全局 `metatree` 操作次数**：0（Small 目录无树操作；Large 目录为该目录专属 `Tree` 的一次 insert，§4.2.3）。持久化：1 条 edit（组提交时与其他 op 共享一次 fsync）。
+
+### 5.2 LOOKUP（单级）
+
+```text
+lock(DirShard[hash(parent)])
+  entry = DirTable[parent].lookup(name)
+  fa = InodeTable[entry.child_fid]
+unlock
+→ 权限检查
+```
+
+### 5.3 LOOKUP_PATH
+
+**服务端**无 PathCache：按 `/` 分段，**每段一次 DirIndex 查找**，最后一段做 access check。  
+**客户端**应先查本地 path 缓存（§4.3）；未命中再发 `LOOKUP_PATH` 或分段 `LOOKUP`（持 `parent_fid`）。
+
+### 5.4 READDIR
+
+```text
+readdir(parent, cookie, max_entries) → 分页返回 DirEntry 列表
+```
+
+| `DirNode.state` | 遍历方式 | cookie 概要 |
+|-----------------|----------|-------------|
+| **SMALL** | 桶序 + 桶内序 | 逻辑位置（§5.4.1） |
+| **PROMOTING** | 仍按 Small | 同 SMALL；`generation` 未变 |
+| **LARGE** | B+ 树 key 序（`kfstree`） | **逻辑 key 游标**，禁止节点指针 |
+
+- 每次 RPC 仅返回 **≤ max_entries**（默认上限如 1024，可配置）。
+- 禁止：一次 RPC 返回百万项；禁止 Large 布局下无序全表扫描。
+
+#### 5.4.1 Readdir Cookie 鲁棒性（已决）
+
+**问题**：若 Large 布局 cookie 编码 **`LeafIter` 内部物理状态**（`Node*`、叶内下标），则两次 `readdir` 之间对该目录的 **`insert`/`del` 导致 B+ 树 split/merge** 后，cookie 可能 **失效或指错位置**（重复、漏项）。目录已为 **LARGE** 时不会发生 promotion，但 **树重平衡仍会发生**。
+
+**决策**：cookie 表示 **逻辑遍历位置**，不绑定可变物理指针；对齐 HDFS「续传令牌 = 逻辑名 / 有序 key」思路。
+
+##### 硬性规则
+
+| 规则 | 说明 |
+|------|------|
+| **禁止** | cookie 中序列化 `Node*`、堆地址、`LeafIter` 内存指针 |
+| **必须** | 可由 `(parent_fid, generation, resume_key)` 在当前树上 **重新定位** |
+| **`generation` 不匹配** | 返回 `EINVAL` / 空 cookie 重启；客户端全量重扫该目录 |
+
+##### SMALL / PROMOTING
+
+```text
+CookieSmall = {
+  generation:   u64
+  layout:       SMALL | PROMOTING
+  bucket_id:    u32      // 开放寻址桶序号（稳定枚举顺序）
+  slot:         u32      // 桶内下一起始槽位
+}
+```
+
+- 仅在 **同一 `generation`、同一 Small 布局** 下有效；**promotion 完成** 后 `generation++`，旧 cookie **作废**（切换为 Large cookie 或从头）。
+
+##### LARGE（推荐：逻辑 key 游标）
+
+**首选**（实现简单、对 split/merge 最稳）：
+
+```text
+CookieLarge = {
+  generation:   u64
+  layout:       LARGE
+  after_hash:   u64          // 上一页最后一条的 name_hash
+  after_name:   bytes        // 上一页最后一条的文件名（字典序续扫）
+}
+```
+
+恢复算法：
+
+```text
+readdir_resume(parent, cookie):
+  if cookie.generation != DirNode.generation: INVALID
+  key = Key(KFS_DENTRY, parent, cookie.after_hash)
+  it = lowerBound(tree, key)                    // 现 kfstree
+  skip entries where (hash, name) <= (cookie.after_hash, cookie.after_name) lexicographically
+  return next max_entries from it (LeafIter 仅作实现手段，不写入 cookie)
+```
+
+- B+ 树 **split/merge 不改变 key 的全序**；只要条目未被删除，续扫位置仍正确。
+- **并发 insert**：新名可能插在已扫过区间之前 → 客户端可能漏扫；与 HDFS 一致，**不保证** 遍历期间快照隔离；强一致列举需 **`generation` 冻结** 或 copy-on-read（**非首版**）。
+- **并发 delete**：已返回的名字可能已不存在；续扫 `lowerBound` 自然跳过。
+
+**可选**（与 HDFS 部分实现类似，需稳定叶 id）：
+
+```text
+CookieLargeAlt = { generation, leaf_node_id, index_in_leaf }
+```
+
+- `leaf_node_id` 为 **分配的稳定叶标识**（split 时子叶继承/拆分规则须在 RFC 实现细则中定义），**不是**运行时指针。
+- 恢复时若 `leaf_node_id` 已合并/分裂：**从该 id 映射节点的最小 key**，或 **`lowerBound(该 key)` 的下一个有效叶** 继续，**宁可少量重复不可漏**（与建议一致）。
+- 首版 **优先 `after_name` 游标**；`leaf_node_id` 方案可在性能优化阶段引入。
+
+##### 与 promotion / mutation 的交互
+
+| 事件 | cookie 行为 |
+|------|-------------|
+| **promotion 完成** | `generation++`；Small cookie **失效**；客户端用空 cookie 对 Large 重扫 |
+| **rename/unlink/rmdir（目录）** | `generation++`；所有 cookie 失效 |
+| **Large 上 create/delete** | `generation` 可不变；**`after_name` cookie 仍有效**（靠 key 重定位）；若产品要求列举快照视图，另议 |
+| **返回 `-EBADF`/`EINVAL`** | 客户端 **丢弃 cookie，从空重新开始** |
+
+##### RPC 响应
+
+- 每页返回：`entries[]`、`more_entries`、`next_cookie`（编码上述结构，版本号 `cookie_ver=1`）。
+- 不把 `LeafIter` 状态暴露给客户端。
+
+**验收（P2）**：在 Large 目录连续 `readdir` 分页过程中注入随机 `insert`/`del`，验证无指针 cookie 时 **无崩溃、无无限循环**；允许与 HDFS 相同的「并发修改下不保证严格快照列举」语义。
+
+---
+
+## 6. 持久化设计
+
+### 6.1 Edit Log 记录格式（概念）
+
+采用 **定长头 + 变长 payload** 的二进制编码（避免 QFS 部分文本 token 解析开销）：
+
+```text
+Record {
+  magic, version
+  txn_id:      u64      // 单调
+  op:          u16      // CREATE=1, REMOVE=2, MKDIR=3, ...
+  op_id:       u128     // 幂等键 (client_id, seq)
+  payload:     op-specific
+}
+checksum per block / per record
+```
+
+**CREATE payload 示例字段**：`parent_fid, name, new_fid, mode, uid, gid, replication, ...`
+
+### 6.2 组提交（Group Commit）
+
+| 模式 | 行为 |
+|------|------|
+| `batch`（默认） | 每 `commit_interval_ms` 或 `commit_batch_bytes` 一次 `fdatasync` |
+| `always` | 每个 txn 等待 fsync（兼容强一致测试） |
+| `none` | 仅写 page cache，崩溃可能丢最近操作（需明确禁用场景） |
+
+**Log 线程模型**：单写者 append + fsync；namespace 分片锁与 log 锁分离，缩短临界区。
+
+### 6.3 用户快照（已决：HDFS 式引用 + 文件级 COW）
+
+**放置说明**：本节描述 **Snapshottable 目录上的用户可见快照**（类比 HDFS `createSnapshot`），与 §6.4 **周期性 Checkpoint/FSImage**（NN 冷备）分工不同。实现可落在 **P3/P3.1**（§9）。
+
+**决策：创建快照采用 HDFS 核心思路——引用（Rename/Reference）而非复制；修改采用文件级写时复制（COW）。** 不采用对整棵树做全量内存扫描来「创建」用户快照（该做法保留给 §6.4 Checkpoint）。
+
+#### 6.3.1 核心机制（对齐 HDFS）
+
+| HDFS 概念 | 本 RFC 映射 |
+|-----------|-------------|
+| `INodeReference` | **`InodeRef`**：快照目录上的一个轻量引用，指向某 **`fid`（目录或文件根）** 在 `committed_txn_id = N` 时的逻辑视图 |
+| 创建快照 O(1) | 在 snapshottable 目录 `D` 上新增 `Snapshot{s_id}` → 仅增加 **Ref → D 的 inode/目录状态**，**不**复制百万子项 |
+| 读快照 | 沿 Ref 解析路径；**无锁读**（读快照侧为只读视图） |
+| 首次修改被快照覆盖的文件 | **文件级 COW**：保留旧 `Inode`+`BlockMap` 给快照；活动命名空间新建 `Inode`（新 fid 或新 inode 行）并更新 **该名字** 在 `DirTable` 中的映射 |
+| 修改目录下其他未触碰文件 | **零开销**（Ref 仍指向原 inode；活动 DirTable 不变） |
+
+```text
+allowSnapshot(dir_fid)     // 标记目录可打快照（类比 snapshottable）
+createSnapshot(dir, name)  // 例如 /foo → s1
+  → SnapshotRecord { id, parent_snap, root_ref → inode@N }   // O(1)，无子树复制
+
+// 用户 delete / truncate / 覆盖写 / rename 活动树中的 file1，且 file1 在 s1 覆盖下：
+mutate(file1):
+  if inode_snapshotted(file1):
+    cow_inode(file1):
+      frozen = clone_inode_shallow(file1)   // 快照保留
+      live   = new_inode_for_mutation()     // 活动命名空间
+      DirTable[parent].replace_name(file1 → live)
+      append EditLog(INODE_COW_SPLIT, ...)
+  else:
+    normal_mutate(file1)
+```
+
+- **目录级百万文件**：创建 `s1` **不遍历** `DirTable`；仅在被修改的单个文件上支付 COW（约一次 create + 后续 write 的元数据开销）。
+- **Large 目录**：COW 只 **`replace_name` 一条 DirEntry**（Small 或 `kfstree` 单键更新），不重扫整棵 per-dir `Tree`。
+
+#### 6.3.2 性能预期（与 HDFS 对照）
+
+| 场景 | 性能 | 原因 |
+|------|------|------|
+| 读活动/读快照文件 | 快照读无额外锁；活动读与无快照相同 | Ref 只读解析 |
+| 创建 / 删除快照 | **近似 O(1)** | 仅增删 `SnapshotRecord` / `InodeRef` |
+| 首次修改快照覆盖下的文件 | 有开销（COW 一个 inode） | 与被修改文件数成正比，与目录总规模无关 |
+| 再次修改已 COW 过的活动文件 | 与无快照相同 | 已操作活动侧新 inode |
+
+#### 6.3.3 数据结构（与 §4.1 衔接）
+
+```text
+SnapshotRecord {
+  snap_id, name, root_dir_fid, txn_id_at_create: N
+  root_ref: InodeRef              // O(1) 创建：指向 snapshottable 根目录 inode
+  cow_inodes:  set<fid_t>         // 可选：本快照触发的 frozen fid 登记，便于 delete 时递减
+}
+
+InodeRef { target_fid, txn_id_cap }
+```
+
+- **`snap_ref_count`** 定义在 §4.1 `Inode` 上：表示有多少 **独立快照引用** 仍依赖该 **inode 对象**（通常为 COW 后的 **frozen** 副本；活动/live inode 在分裂后一般为 0）。
+- Edit log：`SNAPSHOT_CREATE`、`SNAPSHOT_DELETE`、`INODE_COW_SPLIT`（含 `frozen_fid`、`live_fid`、`snap_ref_delta`），供 standby **确定性 replay**。
+
+#### 6.3.6 Frozen inode 引用计数（已决）
+
+**问题**：§6.3 删除快照时「仅回收本快照专属的 frozen inode」。若同一 frozen inode 被 **多个快照** 引用（例如 `/foo` 上连续创建 `s1`、`s2` 后才首次修改 `file1`），**不能在 `snap_ref_count > 0` 时释放**。
+
+**决策**：在 `Inode` 上维护 **`snap_ref_count`**；在 **COW 分裂** 与 **删除快照** 时严格增减；减到 **0** 才可回收该 frozen inode（及对应 `BlockMap`）。
+
+##### 何时增减（与 HDFS 文件级 COW 对齐）
+
+| 事件 | `snap_ref_count` | 说明 |
+|------|------------------|------|
+| **`createSnapshot`** | 根目录 `root_ref.target` **+1**（可选） | 创建本身 O(1)；**不**遍历子树给每个文件 +1。未 COW 的文件仍与 live 共用同一 `fid`，读快照走解析路径。 |
+| **首次 `COW_SPLIT`（file1）** | 对 **frozen_fid**（旧 inode 副本）设为 **覆盖该文件的所有活跃快照数** `K` | 例：存在 `s1`、`s2` 均可见 `file1` 时尚未修改 → `frozen.snap_ref_count = 2`。活动侧新 `live_fid`：`snap_ref_count = 0`。 |
+| **再建快照 `s3`（已有 frozen file1）** | 若 `s3` 仍指向含 `file1` 的视图且 `file1` 已 frozen：对 `frozen_fid` **+1** | 仅影响 **已分裂** 的 frozen 对象；仍与 live 共用的路径在首次 COW 时一次性结算。 |
+| **`deleteSnapshot(s)`** | 对该快照登记过的每个 `frozen_fid`：**-1** | 来自 `cow_inodes` 或快照元数据索引；**仅当减到 0** 时 `free_inode(frozen_fid)` + 释放 BlockMap |
+| **活动路径修改 live inode** | 不增减 | live 与快照引用解耦 |
+
+```text
+cow_split(file_fid, parent, name):
+  frozen_fid = retain_or_clone_inode(file_fid)   // 旧版本留给快照
+  live_fid   = allocate_new_inode(...)
+  frozen.snap_ref_count = count_snapshots_covering(parent, name, frozen_fid)
+  DirTable[parent].replace_name(name, live_fid)
+  for each snap covering this path:
+    snap.cow_inodes.insert(frozen_fid)
+  append EditLog(INODE_COW_SPLIT, frozen_fid, live_fid, snap_ref_count, ...)
+
+deleteSnapshot(snap_id):
+  for fid in snap.cow_inodes:
+    if (--InodeTable[fid].snap_ref_count == 0)
+      free_inode_and_blockmap(fid)
+  remove SnapshotRecord
+  append EditLog(SNAPSHOT_DELETE, snap_id, ...)
+```
+
+##### 回收规则（「仅属于该快照」的精确定义）
+
+- **可回收**：`snap_ref_count` 在 `deleteSnapshot` 后变为 **0** 的 inode（表示 **没有任何** 快照再引用该 frozen 版本）。
+- **不可回收**：`snap_ref_count > 0`——即使本次删除的 `s_i` 不再引用，只要还有 `s_j` 引用同一 frozen 副本，就必须保留。
+- **活动 inode**：`snap_ref_count == 0` 为常态；删除快照 **永不** 直接 `free` 当前 live `fid`（除非该 `fid` 本身也是某次 COW 的 frozen 且计数归零）。
+
+##### 正确性验证（实现必须覆盖）
+
+| 检查点 | 要求 |
+|--------|------|
+| **无双重释放** | 仅当 `snap_ref_count == 0` 入 free 队列；delete/replay 幂等 |
+| **无泄漏** | 删除最后一个持有引用快照后，frozen 必入 free；fsck 扫描 `snap_ref_count==0` 且 unreachable |
+| **Replay** | `INODE_COW_SPLIT` / `SNAPSHOT_DELETE` 重放后计数与主路径一致 |
+| **并发** | COW 与 `deleteSnapshot` 在同 `fid` 或 snap 元数据锁下串行化计数更新 |
+| **循环引用** | 命名空间为 **DAG**（父指针单父目录）；`InodeRef` 仅 **快照元数据 → inode**，inode **不** 指回 `SnapshotRecord`，图 **无环**。无需通用循环引用检测，但需在 code review / 单元测试中 **断言** 不建立 inode→snapshot 反向边 |
+
+##### fsck（§8.5 扩展）
+
+- 对每个 `snap_ref_count > 0` 的 inode：存在至少一条 `SnapshotRecord` / `cow_inodes` 反向引用。
+- 对每个 `SnapshotRecord.cow_inodes` 中的 `fid`：`snap_ref_count >= 1`。
+- 删除快照后的 spot check：`cow_inodes` 中不应出现已 free 的 `fid`。
+
+**成熟度说明**：引用计数为业界成熟手段（HDFS snapshot diff、btrfs 等同类问题），但本实现须在 **COW 分裂计数初值**、**多快照叠加**、**delete + replay** 三条路径上做 **专项测试**（属性测试或模拟并发删除），列入 **P3.1 验收**。
+
+#### 6.3.4 与 §6.4 Checkpoint 的边界
+
+| | §6.3 用户快照 | §6.4 Checkpoint/FSImage |
+|--|----------------|-------------------------|
+| 目的 | 时间点恢复、误删回滚、对比历史 | MetaServer **重启/冷备**、缩短 replay |
+| 创建成本 | **O(1)** per snap | O(namespace) 后台扫描（可模糊） |
+| 读路径 | 快照视图 | 正常命名空间 |
+| 存储 | 内存 Ref + 被 COW 分离的 inode | 磁盘 FSImage 文件 |
+
+两者可同时存在：HDFS 亦区分 **Snapshot** 与 **Checkpoint（FSImage）**。
+
+#### 6.3.5 未采纳为用户快照的方案
+
+| 方案 | 结论 |
+|------|------|
+| 一致性点 + 全量遍历生成用户快照 | **否**；移至 §6.4，仅用于 Checkpoint |
+| 全局 freeze 瞬时快照 | 阻塞写，不利于 CREATE 目标 |
+| 目录级深拷贝百万子项 | O(N) 创建，不可接受 |
+
+### 6.4 Checkpoint / FSImage（已决：一致性点 + 后台遍历）
+
+**用途**：周期性 **MetaServer 冷备与启动加速**（类比 HDFS FSImage + edits），**不是** §6.3 的用户快照。
+
+**决策**：采用 **一致性点 `N = committed_txn_id` + 后台遍历** 写出模糊 FSImage；恢复时 `load FSImage(N)` + `replay(txn_id > N)`。与 §6.3 HDFS 式快照 **正交**。
+
+#### 6.4.1 流程
+
+```text
+triggerCheckpoint()   // 周期或 MetaCheckpoint RPC
+  ├─ 记录 LAST_TXN_ID = committed_txn_id  （N）
+  ├─ 后台线程遍历 InodeTable、DirTable（§4）、BlockMap（可选）
+  │     允许与写并发；图像可「模糊」
+  ├─ 写出 FSImage + footer(N)
+  └─ 原子 publish
+
+冷启动：load FSImage(N) → replay Edit Log (txn_id > N) → 一致
+```
+
+正确性：依赖 §6.6 的「改内存 → append log」顺序与 §6.7 的 committed 边界（`txn_id ≤ N` 的变更在记录 N 时均已作用于内存并进入 log）；遍历期间并发写入造成的模糊项由 replay `txn_id > N` 修正，因此 replay 必须对已包含在 FSImage 中的变更幂等。
+
+#### 6.4.2 FSImage 内容与 Large 目录
+
+- `section_inodes`、`section_dirs`（Small 桶或嵌入 **`kfstree` checkpoint 流**）。
+- log 截断：**可选**运维操作，非恢复前提。
+
+#### 6.4.3 代价
+
+Checkpoint 扫描慢 → `txn_id > N` 的 log 段变长 → **重启 replay 变长**；需控制 checkpoint 周期（配置 `meta.checkpoint.interval` 等）。
+
+### 6.5 与 QFS LogWriter / VR 的关系
+
+- **可复用**：quorum 复制、block 切分、primary lease、`MetaVrLogSeq` 序语义。
+- **需替换**：`WriteLog` 序列化内容与 replay 解析器（`Replay.cc` / `replay_create` 文本格式 → 二进制 op）。
+- **不再依赖**：`metatree.insert` 作为 redo 单元；redo 单元为 **edit op**。
+
+### 6.6 内存修改与 log 的顺序（相对 QFS 的关键改进）
+
+**提议默认顺序**：
+
+```text
+（分片锁内）改内存 → append 到 log 内存 buffer → 释放锁
+（log 线程）buffer → 复制 → fsync → 推进 committed_txn_id
+```
+
+对比 QFS：**先 log committed 再 `handle()`**，客户端等待包含「空窗期」内无法从内存读到结果的双重延迟。本 RFC 的可见性边界见 **§6.7**：其他客户端以 **已提交命名空间** 为准；发起方在 RPC 成功后的可见范围与 **lease / sync 策略** 对齐 HDFS 习惯，而非「未提交 txn 全网可见」。
+
+### 6.7 读一致性（已决）
+
+**决策：采用 (c) 跟随 HDFS 风格的 lease + 已提交命名空间模型**，并与 QFS 现有 **primary / VR / chunk lease** 语义衔接（`LEASE_ACQUIRE`、`LEASE_RENEW` 等，见 `MetaRequest`）。
+
+| 场景 | 规则 |
+|------|------|
+| **命名空间变更**（CREATE / REMOVE / RENAME …） | 对其他客户端：仅在 edit **已 committed**（`committed_txn_id` 推进、quorum 复制完成）后可见；primary 内存中未 fsync 的 buffer **不**对外暴露。 |
+| **RPC 返回与 durable** | `sync=always`：成功返回 ≡ 命名空间变更已 durable，他客户端可见（在 primary 正常服务前提下）。`sync=batch`：返回表示 **已接受**；他客户端可见时点不早于本批 **组提交 fsync**（类比 HDFS edit 组提交窗口）。 |
+| **文件数据读写** | 命名空间登记（create 得 fid）与 **写数据** 分离；已打开文件的读写一致性由 **chunk lease** 保证写者独占/租约续期，读者看到已提交块版本，与 HDFS 「NN 管名字、DN 管块 + lease」分工一致。 |
+| **Primary / standby** | 仅 primary 执行 namespace 变更并写 edit；standby 通过 log replay 追赶；客户端 mutating 与强一致命名空间读面向 primary（与现 VR 一致）。 |
+
+**不采用**：
+
+- **(a) 仅 primary 本地可见未提交变更**：不足以定义多客户端语义，且与 backup 复制模型冲突。
+- **(b) 未提交 txn 全网可见**：破坏恢复与 fsck 假设，并引入跨客户端脏读。
+
+**实现提示**：可在 `Inode` 或目录上保留 `last_committed_txn`；`lookup` / `readdir` 仅暴露 `txn_id ≤ committed_txn_id` 的视图；写路径 lease 逻辑复用现有 QFS 实现，本层不新增第二套租约协议。
+
+---
+
+## 7. 并发模型
+
+### 7.1 锁层次
+
+| 资源 | 锁粒度 |
+|------|--------|
+| `DirIndex` | `hash(parent_fid) % N` 分片锁：**读锁**（lookup/readdir，`PROMOTING` 仍读 Small）；**写锁**（create/promotion，写者等待 `PROMOTING` 结束） |
+| `InodeTable` | `hash(fid) % M` 分片；读多写少用 RW lock |
+| `FidAllocator` | 无锁原子或独立 mutex |
+| `EditLog buffer` | 单写者 + MPSC 队列 |
+| `PathCache` | 服务端不设（§4.3）；仅存在于客户端本地，由客户端自行同步，不占用服务端锁层次 |
+
+**禁止**：所有 mutating RPC 共用一个 `submit_request` 全局 mutex（现状瓶颈）。
+
+### 7.2 与 B+ 树分片锁的区别
+
+对 **全局 `metatree`（单例 B+ 树）**，「按 parent 加锁」**不安全**（不同目录可能 split 同一内部节点，见 `MetaTree-Lock-Optimization.md`）。  
+对 **DirTable 分片**：按 `parent_fid` 加锁 **安全**——Small 为独立 `flat_hash`；Large 为 **该目录专属 `Tree` 实例**（仍用 `kfstree`，但不与别目录共享内部节点）。
+
+---
+
+## 8. RPC 与客户端约定
+
+### 8.1 保持兼容的字段
+
+- `CREATE`：`P`（parent fid）、`N`（name）、`R`（replicas）等现有 QFS 头。
+- 响应：`H`（新 fid）不变。
+
+### 8.2 推荐客户端行为
+
+1. **路径与父 fid 缓存（必选）**：在客户端维护 `path → { fid, generation }` 与 `parent_fid`；mutating 成功后更新或按 §4.3 失效；**不要依赖** MetaServer 路径缓存。
+2. **批量 create**：`MULTI_CREATE` 一次 RPC 多条，log 一条 batch op 或连续 append 一次 fsync。
+3. **幂等**：携带 `r`（reqId）；服务端 LRU 表 `op_id → result`（TTL 秒级）。
+4. **响应字段**：`LOOKUP` / `CREATE` / `READDIR` 等返回目录 `generation`（或等价 epoch），供客户端校验本地 cache。
+
+### 8.3 服务端可删减的 create 工作
+
+将 **striping / tier / object-store 判定** 延后到 `SETATTR` 或 **首次 `ALLOCATE`**，使 `CREATE` 保持最小临界区（可选配置开关兼容旧语义）。
+
+### 8.4 特殊路径：WORM、dumpster、虚拟 `/proc`（已决）
+
+对齐现 QFS（`gWormMode`、`DUMPSTERDIR`、`/proc/invalid_chunks`），在 **DirIndex + InodeTable** 模型下的规则如下。
+
+#### WORM
+
+| 项 | 规则 |
+|----|------|
+| 开关 | 全局 `worm_mode`（等价 `TOGGLE_WORM` RPC），与现网一致。 |
+| 拦截层 | **RPC / op 分发层** 统一校验：在 `worm_mode` 下，`REMOVE` / `RENAME` / 覆盖写等 mutating 若目标路径或文件名不满足 `IsWormMutationAllowed`，返回 `-EPERM`。 |
+| DirIndex | **不**为 WORM 单独建索引类型；普通 `DirIndex` 操作不变。 |
+| 持久化 | `worm_mode` 写入 edit / snapshot 元数据段（或专用 op），恢复后恢复开关状态。 |
+
+#### dumpster（`/dumpster`）
+
+| 项 | 规则 |
+|----|------|
+| 形态 | 根目录下 **普通目录**，启动时 `MKDIR(ROOT, "dumpster")` 得到固定 `dumpster_fid`；在 `Inode.flags` 标记 **`INODE_FLAG_DUMPSTER_ROOT`**（仅根下该目录）。 |
+| 用途 | `remove(..., todumpster=true)` 语义为 **rename 到 `dumpster_fid` 下**（与现 `kfsops` 一致），不是额外隐藏表。 |
+| 限制（RPC） | 禁止在 dumpster 内 **create/mkdir**；禁止任意 rename **进入或离开** dumpster（`mEnforceDumpsterRulesFlag` 等价配置）；禁止删除 dumpster 目录本身。 |
+| DirIndex | 与普通目录相同：`DirIndex[dumpster_fid]` 存待清理文件；后台任务对非 busy 文件再 `remove`。 |
+
+#### `/proc/invalid_chunks`（虚拟路径）
+
+| 项 | 规则 |
+|----|------|
+| 形态 | **不进入 `DirIndex`**；无真实 `proc` 目录项。 |
+| 解析 | RPC 层（如 `CREATE` / `LOOKUP_PATH` 入口）识别前缀 `/proc/invalid_chunks/`，解析 `chunkId` 后直查 **`BlockMap` / chunk 元数据**，用于诊断日志（对齐现 `MetaCreate::start` 中 `invalChunkFlag` 分支）。 |
+| 客户端 | 不应缓存该路径为普通目录；不分配长期 fid。 |
+
+### 8.5 fsck（已决）
+
+**不再遍历 B+ 树叶子**；单机全量检查按以下顺序（可 fork 后台进程，对齐现 `MetaFsck` 工具链）：
+
+```text
+Phase A — InodeTable
+  对每个 fid：
+    - 类型合法；parent_fid 存在（或为 ROOT）
+    - 若 type=file：BlockMap 条目可选校验（chunk 副本、版本，委托 LayoutManager 逻辑）
+    - 标记 abandoned / 零长度策略（沿用现 fsck 配置项）
+
+Phase B — DirTable（每个目录 fid）
+  按 DirNode.layout 枚举：
+    - **SMALL**：flat_hash 全桶扫描，校验无重复 NameKey、探测链有界
+    - **LARGE**：遍历该目录专属 `Tree` 叶（同现 `kfstree` 迭代），校验 `Key(KFS_DENTRY, parent, hash)` 与 name 唯一
+  对每条 DirEntry (name → child_fid)：
+    - InodeTable[child_fid] 存在且 parent_fid == 当前目录 fid
+  对 InodeTable 中 type=dir 的项：
+    - 必须存在 DirTable[dir_fid]；`child_count` 与枚举数量一致
+    - `DirNode.state` 不得持久化为 **PROMOTING**；若 checkpoint 遇到则按 §4.2.6 修复或拒绝加载
+    - 若 `child_count >= dir_large_threshold` 则 `DirNode.state` 应为 **LARGE**
+
+Phase C — 双向一致
+  - 无「仅在 DirIndex 出现、Inode 无 parent」的孤儿
+  - 无「Inode 有 parent 但父目录 DirIndex 无对应 name」的悬空项
+  - dumpster 子项：仅允许 file 类型条目（可选策略检查）
+
+Phase D — 与 edit committed 视图一致（可选在线 fsck）
+  仅扫描 txn_id ≤ committed_txn_id 的视图（§6.7）
+
+Phase E — 用户快照引用计数（§6.3.6）
+  - 对每个 SnapshotRecord：cow_inodes 中 fid 存在且 snap_ref_count >= 1
+  - 对每个 snap_ref_count > 0 的 inode：至少被一个 SnapshotRecord.cow_inodes 引用
+  - 无 snap_ref_count == 0 且仅被快照元数据悬挂的 unreachable frozen
+```
+
+报告格式可继续兼容现 `MetaFsck` / `kfsfsck` 客户端字段；内部扫描源从 `metatree` 迭代改为 **InodeTable + DirIndex 枚举**。
+
+---
+
+## 9. 分阶段实施路线图
+
+| 阶段 | 交付 | CREATE 预期收益 |
+|------|------|-----------------|
+| **P0** | 文档 + 基准：分解 QFS create = queue / fsync / btree / mutex | 基线数据 |
+| **P1** | Log 组提交 + 缩小 `submit_request` 锁；客户端强制 parent fid | 中（不改索引） |
+| **P2** | `DirTable`（§4.2 Small+Large+promotion）+ `InodeTable`；百万级单目录基准 | 高 |
+| **P2.1** | §8.4 特殊路径 + §8.5 fsck（含两种 DirNode layout） | 可运维 |
+| **P3** | v2 edit + §6.4 Checkpoint（FSImage N + replay）+ 冷启动闭环 | 很高 |
+| **P3.1** | §6.3 用户快照 + §6.3.6 `snap_ref_count`（COW/delete/replay/fsck 测试） | 可回滚目录 |
+
+（**范围外**：多 MetaServer namespace 分片、BlockMap 独立服务、inode 换出等，不列入本 RFC 路线图。）
+
+---
+
+## 10. 开放问题
+
+本 RFC 范围内 **无剩余开放项**。已决事项索引：
+
+| 主题 | 章节 |
+|------|------|
+| 读一致性 | §6.7 |
+| PathCache | §4.3 |
+| 单机内存范围 | §2.1 / §2.2 |
+| WORM / dumpster / `/proc/invalid_chunks` | §8.4 |
+| fsck | §8.5 |
+| 大目录索引（Small/Large + promotion） | §4.2 |
+| Promotion 原子性与可见性 | §4.2.6 |
+| Readdir cookie 逻辑位置 | §5.4.1 |
+| 用户快照（HDFS 式 Ref + 文件级 COW） | §6.3 |
+| Frozen inode `snap_ref_count` | §6.3.6 |
+| Checkpoint/FSImage（一致性点 + 后台遍历） | §6.4 |
+
+后续若扩展 **多机分片、inode 换出**，另起 RFC。
+
+---
+
+## 11. 备选方案（已否决或延后）
+
+| 方案 | 结论 |
+|------|------|
+| 保留全局 B+ 树，仅优化锁 | 无法消除双 insert 与树分裂；并发上限低（见 `MetaTree-Lock-Optimization.md`） |
+| 仅全局 B+ 树 | 已否决；见 §4.2.5 |
+| 单目录百万项仍用平铺 HashMap+链表 | **已否决**；首版必须 Large 布局 + promotion |
+| 每目录一棵 B+ 树（Large 布局） | **已采纳**，**复用 `kfstree`**，仅用于 `child_count ≥ threshold` 的目录 |
+| 自研另一套目录 B-tree 实现 | **已否决**，与现网重复且难保持 checkpoint 一致 |
+| 纯 tmpfs、无持久化 | 不符合 QFS 定位 |
+| 完全照搬 RocksDB/LSM 存 namespace | 写放大与 create 延迟不如 hash + edit log 直接 |
+
+---
+
+## 12. 成功指标（建议验收）
+
+在相同硬件与 `sync=batch`（如 1ms 组提交）下，相对当前 QFS main：
+
+| 指标 | 目标（示例，需基准标定） |
+|------|--------------------------|
+| 空文件 create p50 | 降低 ≥ 50% |
+| 空文件 create p99 | 降低 ≥ 40%（受 fsync 尾延迟影响） |
+| create QPS（单 meta，多客户端线程） | 提升 ≥ 3×，且随线程数近线性扩展，直至触及磁盘/log 瓶颈 |
+| 单目录 10⁶ 子项 lookup p99 | < 50µs 量级（Large 布局，无长链；以基准为准） |
+| 单目录 10⁶ 子项 readdir（每页 1k） | 稳定延迟，不随 N 线性恶化 |
+| 恢复时间 | `FSImage(N)` + `replay(txn>N)`（§6.4）；与用户快照创建 O(1)（§6.3）无关 |
+| 创建用户快照 | O(1)，与目录子项数无关（§6.3） |
+
+---
+
+## 13. 参考文献（仓库内）
+
+- `src/cc/meta/kfstree.h` / `kfstree.cc` — B+ 树（Large 目录 **复用** 本实现；全局 `metatree` 不再用于 namespace dentry）
+- `src/cc/meta/kfsops.cc` — `Tree::create` / `link` 双 `insert`
+- `src/cc/meta/MetaRequest.cc` — `MetaCreate::start` / `handle`，`SubmitBegin`
+- `src/cc/meta/LogWriter.cc` — `Enqueue`、`WriteLog`、`fsync`
+- `src/cc/meta/NetDispatch.cc` — `submit_request` 全局串行注释
+- `src/cc/meta/MetaRequest.cc` — `gWormMode`、`/proc/invalid_chunks`、`MetaFsck`
+- `src/cc/meta/kfsops.cc` — `DUMPSTERDIR`、dumpster rename/remove 规则
+- `MetaTree-Lock-Optimization.md` — B+ 树分片锁不安全分析
+- `wiki/Performance-Comparison-to-HDFS.md` — 历史 metaserver 对比背景
+
+---
+
+## 修订历史
+
+| 版本 | 日期 | 说明 |
+|------|------|------|
+| 0.1 | 2026-05-25 | 初稿：绿场内存原生元数据层，对照 QFS/HDFS |
+| 0.2 | 2026-05-25 | 移除 § 迁移与兼容；明确绿场/冷启动范围，路线图去掉迁移工具 |
+| 0.3 | 2026-05-25 | 读一致性决策：§6.7 采用 HDFS 风格 lease + 已提交命名空间 |
+| 0.4 | 2026-05-25 | PathCache 决策：§4.3 仅客户端缓存，MetaServer 不维护/复制路径 cache |
+| 0.5 | 2026-05-25 | 范围限定单机内存；去掉 namespace 水平分片/换出开放项与 P4 路线图 |
+| 0.6 | 2026-05-25 | §8.4 特殊路径、§8.5 fsck 已决；§10 无剩余开放项 |
+| 0.7 | 2026-05-25 | §4.2 大目录首版必做：Small flat_hash + Large 每目录 B+ 树 + promotion |
+| 0.8 | 2026-05-25 | Large 布局明确复用现 `kfstree`（`Tree`/`Node`/`Key`/`MetaDentry`），不新写 B-tree |
+| 0.9 | 2026-05-25 | §6.3 已决：一致性点 + 后台模糊 FSImage + replay(txn>N) |
+| 1.0 | 2026-05-25 | §6.3 改为 HDFS 式用户快照（InodeRef+文件级COW）；§6.4 为 Checkpoint/FSImage |
+| 1.1 | 2026-05-25 | §4.2.6 Promotion：`PROMOTING` 状态、staging、读 Small/写等待、原子发布 |
+| 1.2 | 2026-05-25 | §6.3.6：`snap_ref_count`、COW/删快照维护、fsck 与无环不变量 |
+| 1.3 | 2026-05-25 | §5.4.1：readdir cookie 用逻辑 key 游标，禁止 LeafIter/节点指针 |

From 469b735db745c9758e157950ae84fd426a20bb1b Mon Sep 17 00:00:00 2001
From: zhangzhibiao <zhangzhibiao@bigo.sg>
Date: Mon, 25 May 2026 18:08:50 +0800
Subject: [PATCH 5/7] modify

---
 .../RFC-0001-memory-native-metadata-layer.md  | 68 +++++++++++++++----
 1 file changed, 56 insertions(+), 12 deletions(-)

diff --git a/docs/rfc/RFC-0001-memory-native-metadata-layer.md b/docs/rfc/RFC-0001-memory-native-metadata-layer.md
index 6074bfe2c..2d86a134f 100644
--- a/docs/rfc/RFC-0001-memory-native-metadata-layer.md
+++ b/docs/rfc/RFC-0001-memory-native-metadata-layer.md
@@ -278,6 +278,16 @@ promote_small_to_large(parent_fid):
 
 **不采用**：晋升过程中对活动 RPC 暴露「部分迁移」的 Large；不采用无 `PROMOTING` 标记、原地边建树边切换 `layout` 字段。
 
+##### 晋升期间的读性能与写者饥饿（已决）
+
+- **读者**：`PROMOTING` 期间仍持目录 **读锁** 访问 `body.small`，可与其它 `lookup`/`readdir` **并发**；晋升线程 **只读遍历** Small，不修改 Small。
+- **风险**：触发晋升时 Small 可能已接近阈值（如 **数千～4096** 项），步骤 4 的 `insert` 循环耗时可 **阻塞同目录所有写者**（`create`/`unlink` 等写操作均需等待 `promote_cv`），极端情况下造成 **写者饥饿**。
+- **决策**：单次 `promote_small_to_large` 须有 **墙上时钟上限**（默认 **`meta.dir.promoteMaxWallMs = 1000`**，可配置）：
+  - 在循环中 **分批** `insert`（如每批 256/512 项）并检查超时；
+  - **未超时**：正常完成步骤 5–7；
+  - **超时**：中止本轮晋升 → **回滚 SMALL**（§失败与恢复），返回 `-EBUSY` / 可重试错误；**不**半发布 Large；客户端/写路径 **退避重试** 或稍后由下一次 `create` 再次触发。
+- **观测**：对 `promote_wall_ms`、`promote_aborted_timeout` 打点；P2 验收：4096 项目录晋升 p99 墙钟 **≤ 配置上限**。
+
 ##### 失败与恢复
 
 - 若步骤 4 失败：`state` 回滚 **SMALL**，丢弃 `staging`，`generation` 不变，唤醒等待者并返回错误。
@@ -418,19 +428,21 @@ CookieSmall = {
 CookieLarge = {
   generation:   u64
   layout:       LARGE
-  after_hash:   u64          // 上一页最后一条的 name_hash
-  after_name:   bytes        // 上一页最后一条的文件名（字典序续扫）
+  last_key:     NameKey      // 上一页最后一条的完整排序键 (name_hash, name)
 }
+// 字段名 after_hash/after_name 仅作实现别名，语义上必须是 NameKey 二元组
 ```
 
+- **排序键**：与 §4.2.1 `NameKey` 一致；`kfstree` 叶序为 **先 `name_hash` 再 `name` 字典序**（同 `MetaDentry::matchSelf`）。单目录内 **不可能** 存在两个相同 `name`，但续扫仍须用 **`(hash, name)` 对**，不能仅用 `name`（不同 hash 桶下仅比 name 会错位）。
+- **禁止**：cookie 仅编码 `name` 字符串而省略 `name_hash`。
+
 恢复算法：
 
 ```text
 readdir_resume(parent, cookie):
   if cookie.generation != DirNode.generation: INVALID
-  key = Key(KFS_DENTRY, parent, cookie.after_hash)
-  it = lowerBound(tree, key)                    // 现 kfstree
-  skip entries where (hash,name) <= cookie.after_name lexicographically
+  it = lowerBound(tree, Key(KFS_DENTRY, parent, cookie.last_key.name_hash))
+  skip entries where NameKey(hash,name) <= cookie.last_key lexicographically
   return next max_entries from it (LeafIter 仅作实现手段，不写入 cookie)
 ```
 
@@ -446,7 +458,7 @@ CookieLargeAlt = { generation, leaf_node_id, index_in_leaf }
 
 - `leaf_node_id` 为 **分配的稳定叶标识**（split 时子叶继承/拆分规则须在 RFC 实现细则中定义），**不是**运行时指针。
 - 恢复时若 `leaf_node_id` 已合并/分裂：**从该 id 映射节点的最小 key**，或 **`lowerBound(该 key)` 的下一个有效叶** 继续，**宁可少量重复不可漏**（与建议一致）。
-- 首版 **优先 `after_name` 游标**；`leaf_node_id` 方案可在性能优化阶段引入。
+- 首版 **优先 `last_key`（NameKey）游标**；`leaf_node_id` 方案可在性能优化阶段引入。
 
 ##### 与 promotion / mutation 的交互
 
@@ -454,7 +466,7 @@ CookieLargeAlt = { generation, leaf_node_id, index_in_leaf }
 |------|-------------|
 | **promotion 完成** | `generation++`；Small cookie **失效**；客户端用空 cookie 对 Large 重扫 |
 | **rename/unlink/rmdir（目录）** | `generation++`；所有 cookie 失效 |
-| **Large 上 create/delete** | `generation` 可不变；**`after_name` cookie 仍有效**（靠 key 重定位）；若产品要求列举快照视图，另议 |
+| **Large 上 create/delete** | `generation` 可不变；**`last_key` cookie 仍有效**（靠 `NameKey` 重定位）；若产品要求列举快照视图，另议 |
 | **返回 `-EBADF`/`EINVAL`** | 客户端 **丢弃 cookie，从空重新开始** |
 
 ##### RPC 响应
@@ -575,9 +587,9 @@ InodeRef { target_fid, txn_id_cap }
 cow_split(file_fid, parent, name):
   frozen_fid = retain_or_clone_inode(file_fid)   // 旧版本留给快照
   live_fid   = allocate_new_inode(...)
-  frozen.snap_ref_count = count_snapshots_covering(parent, name, frozen_fid)
+  frozen.snap_ref_count = snapshot_ref_index.count(frozen_fid)   // 见下，禁止仅靠运行时全表扫描
   DirTable[parent].replace_name(name, live_fid)
-  for each snap covering this path:
+  for each snap in snapshot_ref_index.ref_snapshots(frozen_fid):
     snap.cow_inodes.insert(frozen_fid)
   append EditLog(INODE_COW_SPLIT, frozen_fid, live_fid, snap_ref_count, ...)
 
@@ -611,7 +623,22 @@ deleteSnapshot(snap_id):
 - 对每个 `SnapshotRecord.cow_inodes` 中的 `fid`：`snap_ref_count >= 1`。
 - 删除快照后的 spot check：`cow_inodes` 中不应出现已 free 的 `fid`。
 
-**成熟度说明**：引用计数为业界成熟手段（HDFS snapshot diff、 btrfs 等同类问题），但本实现须在 **COW 分裂计数初值**、**多快照叠加**、**delete + replay** 三条路径上做 **专项测试**（属性测试或模拟并发删除），列入 **P3.1 验收**。
+##### `count_snapshots_covering` 与倒排索引（已决）
+
+**问题**：`frozen.snap_ref_count = count_snapshots_covering(...)` 若在 COW 时 **扫描全部 SnapshotRecord** 或沿路径动态枚举，易错且 O(快照数)；多快照引用同一 frozen inode 时 **跨快照累计** 必须精确。
+
+**决策**（二选一，首版至少实现其一）：
+
+| 方案 | 做法 |
+|------|------|
+| **A. 倒排索引（推荐）** | 维护 `SnapshotRefIndex: frozen_fid → { snap_id... }`（及可选 `(parent,name) → frozen_fid`）。`createSnapshot`：对仍与 live 共用的路径 **不** 预遍历；**COW 时** 将 `frozen_fid` 登记到 **当前所有覆盖该 `(parent,name)` 的活跃快照**（由 snap 链/目录 Ref 解析一次，写入索引）。`deleteSnapshot`：对 `cow_inodes` 中每个 `fid` 从索引移除 `snap_id`，再 `--snap_ref_count`。 |
+| **B. 快照创建时预计算** | 在 `createSnapshot` O(1) 元数据之外，记录「该快照可见的 (parent,name)→fid 视图版本」；首次 COW 时用 **快照差分元数据** 得到 `K`，写入 `snap_ref_count` 与 `cow_inodes`。 |
+
+- **禁止**：`deleteSnapshot` 或 replay 时依赖 **未持久化的** 临时扫描结果且与主路径不一致。
+- **再建快照 `s3`（file1 已 frozen）**：`SnapshotRefIndex` 对 `frozen_fid` **insert(s3)** 并 `snap_ref_count++`（与上表「再建快照」行一致）。
+- **fsck**：`snap_ref_count == |SnapshotRefIndex[fid]|`（允许索引与 `cow_inodes` 并集交叉校验）。
+
+**成熟度说明**：引用计数为业界成熟手段，但须在 **COW 初值 / 多快照叠加 / delete + replay / 索引一致性** 上做 **专项测试**，列入 **P3.1 验收**。
 
 #### 6.3.4 与 §6.4 Checkpoint 的边界
 
@@ -662,6 +689,20 @@ triggerCheckpoint()   // 周期或 MetaCheckpoint RPC
 
 快照扫描慢 → `txn_id > N` 的 log 段变长 → **重启 replay 变长**；需控制 checkpoint 周期（配置 `meta.checkpoint.interval` 等）。
 
+#### 6.4.4 扫描期内存压力（已决）
+
+**问题**：后台遍历 `InodeTable`、`DirTable`（含 Large 目录 `kfstree` 流式导出）、`BlockMap` 时，若 **每 inode/每目录项分配独立序列化 buffer**，峰值内存可与 **瞬时分配速率 × 对象数** 成正比，挤压热路径 RSS。
+
+**决策**：
+
+| 措施 | 说明 |
+|------|------|
+| **Buffer 池** | 后台线程 **复用** 固定大小写缓冲（如 1–4 MiB），`section_*` 写满再 flush 到 FSImage 文件，避免 per-object `malloc` |
+| **扫描节流** | `meta.checkpoint.maxEntriesPerTick` / `maxBytesPerTick` 限制每时间片处理条数；`yield` 或短 sleep，避免与 mutating 抢满 CPU |
+| **Large 目录** | 按 `LeafIter` **流式** 写出 checkpoint 记录，**禁止** 先将百万 `MetaDentry` 载入单一 `vector` |
+| **背压** | 若 FSImage 写盘慢于扫描，队列深度有界；超限则 **拉长 checkpoint 周期** 而非无界堆内存 |
+| **可观测** | `checkpoint_scan_rss_delta`、`checkpoint_buffer_pool_bytes` 指标；压测：全量 namespace 扫描期间 CREATE p99 退化 **≤ 约定比例**（如 20%，P3 验收） |
+
 ### 6.5 与 QFS LogWriter / VR 的关系
 
 - **可复用**：quorum 复制、block 切分、primary lease、`MetaVrLogSeq` 序语义。
@@ -836,10 +877,12 @@ Phase E — 用户快照引用计数（§6.3.6）
 | fsck | §8.5 |
 | 大目录索引（Small/Large + promotion） | §4.2 |
 | Promotion 原子性与可见性 | §4.2.6 |
-| Readdir cookie 逻辑位置 | §5.4.1 |
+| Promotion 墙钟上限（写者饥饿） | §4.2.6 |
+| Readdir cookie 逻辑位置（`NameKey`） | §5.4.1 |
 | 用户快照（HDFS 式 Ref + 文件级 COW） | §6.3 |
-| Frozen inode `snap_ref_count` | §6.3.6 |
+| Frozen inode `snap_ref_count` + 倒排索引 | §6.3.6 |
 | Checkpoint/FSImage（一致性点 + 后台遍历） | §6.4 |
+| Checkpoint 扫描内存与节流 | §6.4.4 |
 
 后续若扩展 **多机分片、inode 换出**，另起 RFC。
 
@@ -906,3 +949,4 @@ Phase E — 用户快照引用计数（§6.3.6）
 | 1.1 | 2026-05-25 | §4.2.6 Promotion：`PROMOTING` 状态、staging、读 Small/写等待、原子发布 |
 | 1.2 | 2026-05-25 | §6.3.6：`snap_ref_count`、COW/删快照维护、fsck 与无环不变量 |
 | 1.3 | 2026-05-25 | §5.4.1：readdir cookie 用逻辑 key 游标，禁止 LeafIter/节点指针 |
+| 1.4 | 2026-05-25 | §4.2.6 晋升墙钟上限；§5.4.1 `last_key`；§6.3.6 倒排索引；§6.4.4 checkpoint 内存 |

From c3b38a24d02bf8936cf3c49c524dd1cfded56761 Mon Sep 17 00:00:00 2001
From: zhangzhibiao <zhangzhibiao@bigo.sg>
Date: Mon, 25 May 2026 19:59:31 +0800
Subject: [PATCH 6/7] update doc

---
 .../RFC-0001-memory-native-metadata-layer.md  | 213 ++++++++++++------
 1 file changed, 143 insertions(+), 70 deletions(-)

diff --git a/docs/rfc/RFC-0001-memory-native-metadata-layer.md b/docs/rfc/RFC-0001-memory-native-metadata-layer.md
index 2d86a134f..377191223 100644
--- a/docs/rfc/RFC-0001-memory-native-metadata-layer.md
+++ b/docs/rfc/RFC-0001-memory-native-metadata-layer.md
@@ -105,7 +105,7 @@ QFS **并非不能** 采用 HDFS NN 式布局；当前选择是 **统一 B+ 树
 ┌───────────────────────────▼─────────────────────────────┐
 │  持久化层                                                 │
 │  • Edit Log（二进制 op，组提交 fsync）                     │
-│  • 用户快照：HDFS 式引用 + 文件级 COW（§6.3）               │
+│  • 用户快照：Ref + COW + 目录 diff（§6.3）                 │
 │  • Checkpoint/FSImage：一致性点 N + 后台遍历（§6.4）          │
 │  • Quorum / VR 复制（复用现有 LogWriter/VR 基础设施）       │
 └─────────────────────────────────────────────────────────┘
@@ -115,11 +115,11 @@ QFS **并非不能** 采用 HDFS NN 式布局；当前选择是 **统一 B+ 树
 
 | 维度 | QFS 现状 | 本 RFC |
 |------|----------|--------|
-| 主索引 | 全局 B+ 树，dentry/fattr 不同 key | DirNode（Small hash / Large **复用 `kfstree` 每目录一棵 `Tree`**）+ InodeTable |
+| 主索引 | 全局 B+ 树，dentry/fattr 不同 key | DirNode（Small hash / Large **抽取/适配 `kfstree` 节点算法的目录局部 B+ 树**）+ InodeTable |
 | CREATE 索引操作 | 2× 全局 `insert` + 多次 `findLeaf` | 1× DirNode insert + 1× InodeTable insert |
 | 百万级单目录 | 同全局树叶子链/同桶冲突风险 | Large 布局 O(log N)，首版必做 promotion |
-| 持久化顺序 | 先 WAL committed，再 `handle()` | 临界区内改内存 + append log buffer；fsync 摊销 |
-| 用户快照 | （现 QFS 无同等机制） | `InodeRef` + 文件级 COW，创建 O(1)（§6.3） |
+| 持久化顺序 | 先 WAL committed，再 `handle()` | 临界区内写入 **pending 版本** + append log buffer；commit 后进入对外可见视图 |
+| 用户快照 | （现 QFS 无同等机制） | `InodeRef` + 文件级 COW + 目录 diff，创建 O(1)（§6.3） |
 | checkpoint | B+ 树页/节点序列化 | 模糊 FSImage（§6.4）+ replay txn>N |
 | chunk 元数据 | 同树 `KFS_CHUNKINFO` | BlockMap 分离，allocate 时再写 |
 
@@ -137,7 +137,7 @@ Inode {
   mode, uid, gid, size, mtime, ctime, atime
   nlink, flags               // 见 §8.4：WORM、dumpster 子树、striping 等
   snapshottable: bool        // 目录可打快照（§6.3）
-  snap_ref_count: u32        // 被用户快照持有的 frozen 引用数（§6.3.6）；live inode 常为 0
+  snap_ref_count: u32        // 被用户快照持有的 frozen 引用数（§6.3.4）；live inode 常为 0
   replication | ec_policy    // 或仅指针，详细布局在 allocate 时设置
   dir_child_count            // 仅目录；用于 readdir 分页提示
   generation:   u64          // 每次 rename/unlink/rmdir/promotion 递增，供 cache 失效
@@ -176,12 +176,12 @@ DirNode  = {
 
 #### 4.2.3 Large 布局（子项数 ≥ 阈值，或 Small 无法安全插入）
 
-**决策：Large 布局直接复用当前 QFS B+ 树实现**（`kfstree.h` / `kfstree.cc`），不新写一套目录 B-tree。与全局 `metatree` 的差异仅是 **每目录一棵独立 `Tree` 实例**，键空间 scoped 在该 `parent_fid` 下。
+**决策：Large 布局复用当前 QFS B+ 树的节点/Key/迭代算法**（`kfstree.h` / `kfstree.cc`），但实现上需要抽取或适配为 **目录局部 B+ 树组件**，而不是把现有 `Tree` 类原封不动实例化。现有 `Tree` 仍带有全局 namespace、checkpoint、dumpster、path cache 等语义；LargeDir 只需要其中的有序 dentry 索引能力。
 
 | 复用组件 | 路径 / 说明 |
 |----------|-------------|
 | 内部节点 | `Node`（`NKEY=170`，4096B 页式节点，`findplace` 二分，`split` / `merge`） |
-| 树操作 | `Tree::insert`、`Tree::del`、`lowerBound` / `findLeaf`、`LeafIter` |
+| 树操作 | 复用/抽取 `insert`、`del`、`lowerBound` / `findLeaf`、`LeafIter` 的节点算法 |
 | 键 | 现有 `Key` / `PartialMatch`；叶键 **`Key(KFS_DENTRY, parent_fid, name_hash)`**，与现 `MetaDentry::keySelf()` 一致 |
 | 叶记录 | `MetaDentry`（或薄封装 `DirBTreeLeaf` 内嵌相同字段）；`matchSelf` 比对 `name` |
 | 内存 | `MetaNode::allocate` / `PoolAllocator`（与现 meta 节点相同） |
@@ -189,17 +189,17 @@ DirNode  = {
 ```text
 LargeDir {
   parent_fid:  fid_t
-  tree:        Tree          // 现 kfstree.Tree，非全局 metatree 单例
+  tree:        DirBTree      // 从 kfstree 节点算法抽取/适配，非全局 metatree 单例
 }
 ```
 
 - **语义**：逻辑上仍是「该目录下 name → child_fid」；物理上用 **一棵子树** 存该目录全部 `MetaDentry` 叶，**不再**插入全局 `metatree` 的混合 key 空间。
-- **`lookup` / `create`**：对该目录的 `Tree` 调用与现 `getDentry` / `insert` 相同逻辑（`findLeaf` + 叶链 `peer()` 扫同名 hash），**O(log N)**，无百万长链。
+- **`lookup` / `create`**：对该目录的 `DirBTree` 调用与现 `getDentry` / `insert` 相同逻辑（`findLeaf` + 叶链 `peer()` 扫同名 hash），**O(log N)**，无百万长链。
 - **`readdir`**：`LeafIter` 逻辑序遍历 + §5.4 **逻辑位置 cookie**（禁止裸指针）。
 - **升格（promotion）**：原子性与并发语义见 **§4.2.6**（`PROMOTING` 状态、写阻塞、读仍用 Small、staging 完成后一次性切换）。
-- **checkpoint / fsck**：Large 目录序列化可 **复用现 Node/Meta checkpoint 格式**；fsck 见 §8.5（`PROMOTING` 视为 transient，持久化快照中不应出现）。
+- **checkpoint / fsck**：Large 目录可复用现 Node/Meta 的 **记录编码思路**，但需新增 `section_dirs.large` 外层元数据（`parent_fid`、layout、generation、child_count、记录数/校验和），不能直接把全局 `metatree` checkpoint 流嵌入；fsck 见 §8.5（`PROMOTING` 视为 transient，持久化快照中不应出现）。
 
-**不新写**：单独的目录 B-tree 节点类型、另一套 split/merge 或不同于 `Node` 页大小的树实现。
+**不重复造轮子**：split/merge、节点页大小、Key 排序、LeafIter 语义应尽量沿用 `kfstree`；但需要把全局 `Tree` 的非目录职责剥离出去。
 
 #### 4.2.4 复杂度与验收（百万级单目录）
 
@@ -216,15 +216,15 @@ LargeDir {
 | | 全局 `metatree`（现 QFS） | Large `DirNode`（本 RFC） |
 |--|---------------------------|---------------------------|
 | 代码 | `kfstree` | **同一套** `kfstree` |
-| 实例 | 单例 `metatree`，混放 dentry/fattr/chunk | **每超大目录一个 `Tree`** |
+| 实例 | 单例 `metatree`，混放 dentry/fattr/chunk | **每超大目录一个 `DirBTree`** |
 | create 副作用 | 可能 split 共享祖先内部节点 | 仅影响该目录子树 |
 | 小目录 | 也走全局树 | **Small `flat_hash`**，不进 B+ 树 |
 
-InodeTable、BlockMap **不再**进入任何 B+ 树；仅 **超大目录的子项列表** 使用 `Tree` 存 `MetaDentry` 叶。
+InodeTable、BlockMap **不再**进入任何 B+ 树；仅 **超大目录的子项列表** 使用 `DirBTree` 存 `MetaDentry` 叶。
 
 #### 4.2.6 晋升（Promotion）的原子性与可见性（已决）
 
-**问题**：§4.2.3 若在「半建成」的 Large `Tree` 上并发 `lookup`/`create`，可能看到 **不完整** 的 B+ 树或 Small/Large 双写混乱。
+**问题**：§4.2.3 若在「半建成」的 Large `DirBTree` 上并发 `lookup`/`create`，可能看到 **不完整** 的 B+ 树或 Small/Large 双写混乱。
 
 **决策**：`DirNode` 增加 **`state`**；晋升在 **staging** 中构建 Large，通过 **一次性发布** 切换；晋升期间 **读走 Small、写阻塞或排队**。
 
@@ -250,7 +250,7 @@ promote_small_to_large(parent_fid):
 
   1. assert(state == SMALL)
   2. state = PROMOTING
-  3. staging.large = new Tree()          // 读者不可见
+  3. staging.large = new DirBTree()      // 读者不可见
   4. for entry in body.small:            // 只读 Small，不改 Small
        staging.large.insert(MetaDentry(...))
   5. // 一次性发布（原子切换可见布局）
@@ -351,24 +351,30 @@ BlockMap : 按 fid 分片
 1. shard = hash(parent_fid) % N_SHARDS
 2. lock(DirShard[shard])
 3.   if DirIndex[parent].contains(name) → 处理 exclusive / truncate 语义
-4.   new_fid = FidAllocator.next()
-5.   DirTable[parent].insert(name, new_fid)   // §4.2，必要时 promotion
-6.   InodeTable[new_fid] = Inode{ parent, attrs... }
-7.   update parent.mtime, parent.file_count
-8.   txn = EditLog.append(CREATE, parent, name, new_fid, attrs, op_id)
-9. unlock
-10. if sync_policy == always: wait(txn.committed)
-11. return new_fid
+4.   txn_id = EditLog.reserve_txn()
+5.   new_fid = FidAllocator.next()
+6.   DirTable[parent].insert_pending(name, new_fid, create_txn=txn_id)   // §4.2，必要时 promotion
+7.   InodeTable[new_fid] = Inode{ parent, attrs..., create_txn=txn_id, delete_txn=none, pending=true }
+8.   update parent pending mtime / child_count version
+9.   EditLog.append_buffer(txn_id, CREATE, parent, name, new_fid, attrs, op_id)
+10. unlock
+11. if sync_policy == always: wait(txn_id.committed)
+12. return { new_fid, txn_id }
 ```
 
 **树操作次数**：0。持久化：1 条 edit（组提交时与其他 op 共享一次 fsync）。
 
+**可见性要求**：步骤 6–9 修改的是 **pending 版本**。普通 `LOOKUP`/`READDIR` 只暴露 `txn_id <= committed_txn_id` 的版本；同一客户端是否可读到自己的 pending create 由会话级 read-your-writes 选项单独定义，默认不向其它客户端暴露未提交 txn。
+
 ### 5.2 LOOKUP（单级）
 
 ```text
 lock(DirShard[hash(parent)])
-  entry = DirTable[parent].lookup(name)
+  entry = DirTable[parent].lookup_committed(name, committed_txn_id)
+  if entry == null: unlock; return ENOENT
   fa = InodeTable[entry.fid]
+  if fa.create_txn > committed_txn_id: unlock; return ENOENT
+  if fa.delete_txn != none and fa.delete_txn <= committed_txn_id: unlock; return ENOENT
 unlock
 → 权限检查
 ```
@@ -386,8 +392,8 @@ readdir(parent, cookie, max_entries) → 分页返回 DirEntry 列表
 
 | `DirNode.state` | 遍历方式 | cookie 概要 |
 |-----------------|----------|-------------|
-| **SMALL** | 桶序 + 桶内序 | 逻辑位置（§5.4.1） |
-| **PROMOTING** | 仍按 Small | 同 SMALL；`generation` 未变 |
+| **SMALL** | 按 `NameKey` 逻辑序（Small 有界，必要时临时排序） | 逻辑 key 游标（§5.4.1） |
+| **PROMOTING** | 仍按 Small 的 `NameKey` 逻辑序 | 同 SMALL；promotion 完成后 `generation++` |
 | **LARGE** | B+ 树 key 序（`kfstree`） | **逻辑 key 游标**，禁止节点指针 |
 
 - 每次 RPC 仅返回 **≤ max_entries**（默认上限如 1024，可配置）。
@@ -413,12 +419,13 @@ readdir(parent, cookie, max_entries) → 分页返回 DirEntry 列表
 CookieSmall = {
   generation:   u64
   layout:       SMALL | PROMOTING
-  bucket_id:    u32      // 开放寻址桶序号（稳定枚举顺序）
-  slot:         u32      // 桶内下一起始槽位
+  last_key:     NameKey?  // 上一页最后一条；空表示从头
 }
 ```
 
-- 仅在 **同一 `generation`、同一 Small 布局** 下有效；**promotion 完成** 后 `generation++`，旧 cookie **作废**（切换为 Large cookie 或从头）。
+- Small 不把开放寻址的 `bucket_id`/`slot` 暴露给 cookie；rehash、删除后的 tombstone 清理、Robin Hood 位移都会改变物理桶位置。
+- `readdir` 对 Small 使用 `NameKey` 逻辑序重定位；Small 有阈值上限（默认 4096），可在每页临时收集并排序，或维护有序 side index。
+- Small 上任意 `create`/`delete`/rehash 必须 `generation++`，旧 cookie 返回 `EINVAL` 并要求客户端重扫；promotion 完成同样 `generation++`，旧 Small cookie 作废。
 
 ##### LARGE（推荐：逻辑 key 游标）
 
@@ -465,6 +472,7 @@ CookieLargeAlt = { generation, leaf_node_id, index_in_leaf }
 | 事件 | cookie 行为 |
 |------|-------------|
 | **promotion 完成** | `generation++`；Small cookie **失效**；客户端用空 cookie 对 Large 重扫 |
+| **Small 上 create/delete/rehash** | `generation++`；Small cookie **失效**，避免开放寻址物理位置变化造成漏扫/重复 |
 | **rename/unlink/rmdir（目录）** | `generation++`；所有 cookie 失效 |
 | **Large 上 create/delete** | `generation` 可不变；**`last_key` cookie 仍有效**（靠 `NameKey` 重定位）；若产品要求列举快照视图，另议 |
 | **返回 `-EBADF`/`EINVAL`** | 客户端 **丢弃 cookie，从空重新开始** |
@@ -507,11 +515,11 @@ checksum per block / per record
 
 **Log 线程模型**：单写者 append + fsync；namespace 分片锁与 log 锁分离，缩短临界区。
 
-### 6.3 用户快照（已决：HDFS 式引用 + 文件级 COW）
+### 6.3 用户快照（已决：HDFS 式引用 + 文件级 COW + 目录 Diff）
 
 **放置说明**：本节描述 **Snapshottable 目录上的用户可见快照**（类比 HDFS `createSnapshot`），与 §6.4 **周期性 Checkpoint/FSImage**（NN 冷备）分工不同。实现可落在 **P3/P3.1**（§9）。
 
-**决策：创建快照采用 HDFS 核心思路——引用（Rename/Reference）而非复制；修改采用文件级写时复制（COW）。** 不采用对整棵树做全量内存扫描来「创建」用户快照（该做法保留给 §6.4 Checkpoint）。
+**决策：创建快照采用 HDFS 核心思路——引用（Rename/Reference）而非复制；文件内容修改采用文件级写时复制（COW）；目录项变化必须记录目录 diff。** 不采用对整棵树做全量内存扫描来「创建」用户快照（该做法保留给 §6.4 Checkpoint）。
 
 #### 6.3.1 核心机制（对齐 HDFS）
 
@@ -541,14 +549,38 @@ mutate(file1):
 ```
 
 - **目录级百万文件**：创建 `s1` **不遍历** `DirTable`；仅在被修改的单个文件上支付 COW（约一次 create + 后续 write 的元数据开销）。
-- **Large 目录**：COW 只 **`replace_name` 一条 DirEntry**（Small 或 `kfstree` 单键更新），不重扫整棵 per-dir `Tree`。
+- **Large 目录**：COW 只 **`replace_name` 一条 DirEntry**（Small 或 `DirBTree` 单键更新），不重扫整棵 per-dir `DirBTree`。
+
+#### 6.3.1.1 目录 Diff（必需）
+
+仅有 `InodeRef + 文件级 COW` **不足以**提供用户快照的时间点语义：快照创建后，live 目录里的 `create`、`unlink`、`rename` 若直接修改 `DirTable`，快照读会跟着变化。首版快照必须同时实现 **目录级 diff**（对齐 HDFS snapshot diff 思路），记录快照创建点之后每个 snapshottable 子树内的目录项变化。
+
+```text
+DirSnapshotDiff {
+  dir_fid, snap_id, base_txn
+  created:  set<NameKey>                 // 快照之后新建，快照视图不可见
+  deleted:  map<NameKey, frozen_fid>      // 快照之后删除/rename out，快照视图仍可见
+  renamed:  optional oldName -> newName   // 可展开为 deleted+created
+}
+```
+
+规则：
+
+- `createSnapshot(D)`：只创建根 `SnapshotRecord`，不遍历百万子项；目录 diff 延迟到后续 mutation 时按需创建。
+- `create(parent, name)`：若 parent 被某个活跃快照覆盖，在对应 `DirSnapshotDiff.created` 记录 `name`，使该快照视图过滤掉新名字。
+- `unlink/rename out(parent, name)`：若被快照覆盖，先冻结当前 `child_fid`（文件按 §6.3.4；目录需冻结目录引用和后续 diff 链），在 `deleted[name]` 记录 frozen 引用，快照视图继续返回旧条目。
+- `rename across dirs`：按源目录 `deleted` + 目标目录 `created` 处理；必须与 §7.3 锁顺序一致。
+- `readdir(snapshot)`：以 live DirIndex 为基底叠加 diff：过滤 `created`，补回 `deleted`，并按 `NameKey` 逻辑序输出；Large 目录仍使用 `DirBTree` lowerBound，再 merge diff 项。
+
+没有目录 diff 时，§6.3 的用户快照只能算 inode 引用缓存，不能作为可恢复的目录快照交付。
 
 #### 6.3.2 性能预期（与 HDFS 对照）
 
 | 场景 | 性能 | 原因 |
 |------|------|------|
 | 读活动/读快照文件 | 快照读无额外锁；活动读与无快照相同 | Ref 只读解析 |
-| 创建 / 删除快照 | **近似 O(1)** | 仅增删 `SnapshotRecord` / `InodeRef` |
+| 创建快照 | **O(1)** | 仅新增 `SnapshotRecord` / `InodeRef`，不遍历子树 |
+| 删除快照 | O(本快照登记的 frozen/diff 项) | 释放 `cow_inodes`、`dir_diffs.deleted` 与倒排索引引用 |
 | 首次修改快照覆盖下的文件 | 有开销（COW 一个 inode） | 与被修改文件数成正比，与目录总规模无关 |
 | 再次修改已 COW 过的活动文件 | 与无快照相同 | 已操作活动侧新 inode |
 
@@ -558,16 +590,17 @@ mutate(file1):
 SnapshotRecord {
   snap_id, name, root_dir_fid, txn_id_at_create: N
   root_ref: InodeRef              // O(1) 创建：指向 snapshottable 根目录 inode
-  cow_inodes:  set<fid_t>         // 可选：本快照触发的 frozen fid 登记，便于 delete 时递减
+  cow_inodes:  set<fid_t>         // 本快照引用的 frozen fid，便于 delete 时递减
+  dir_diffs:   map<dir_fid, DirSnapshotDiff> // 本快照目录项变化
 }
 
 InodeRef { target_fid, txn_id_cap }
 ```
 
-- **`snap_ref_count`** 定义在 §4.1 `Inode` 上：表示有多少 **独立快照引用** 仍依赖该 **inode 对象**（通常为 COW 后的 **frozen** 副本；活动/live inode 在分裂后一般为 0）。
-- Edit log：`SNAPSHOT_CREATE`、`SNAPSHOT_DELETE`、`INODE_COW_SPLIT`（含 `frozen_fid`、`live_fid`、`snap_ref_delta`），供 standby **确定性 replay**。
+- **`snap_ref_count`** 定义在 §4.1 `Inode` 上：表示有多少 **独立快照引用** 仍依赖该 **inode 对象**（通常为 COW 后的 **frozen** 副本；活动/live inode 在分裂后一般为 0）。目录项时间点语义由 §6.3.1.1 的 `DirSnapshotDiff` 维护，inode 引用计数只解决 frozen inode 生命周期。
+- Edit log：`SNAPSHOT_CREATE`、`SNAPSHOT_DELETE`、`INODE_COW_SPLIT`（含 `frozen_fid`、`live_fid`、`snap_ref_delta`）、`DIR_SNAPSHOT_DIFF_UPDATE`，供 standby **确定性 replay**。
 
-#### 6.3.6 Frozen inode 引用计数（已决）
+#### 6.3.4 Frozen inode 引用计数（已决）
 
 **问题**：§6.3 删除快照时「仅回收本快照专属的 frozen inode」。若同一 frozen inode 被 **多个快照** 引用（例如 `/foo` 上连续创建 `s1`、`s2` 后才首次修改 `file1`），**不能在 `snap_ref_count > 0` 时释放**。
 
@@ -580,7 +613,7 @@ InodeRef { target_fid, txn_id_cap }
 | **`createSnapshot`** | 根目录 `root_ref.target` **+1**（可选） | 创建本身 O(1)；**不**遍历子树给每个文件 +1。未 COW 的文件仍与 live 共用同一 `fid`，读快照走解析路径。 |
 | **首次 `COW_SPLIT`（file1）** | 对 **frozen_fid**（旧 inode 副本）设为 **覆盖该文件的所有活跃快照数** `K` | 例：存在 `s1`、`s2` 均可见 `file1` 时尚未修改 → `frozen.snap_ref_count = 2`。活动侧新 `live_fid`：`snap_ref_count = 0`。 |
 | **再建快照 `s3`（已有 frozen file1）** | 若 `s3` 仍指向含 `file1` 的视图且 `file1` 已 frozen：对 `frozen_fid` **+1** | 仅影响 **已分裂** 的 frozen 对象；仍与 live 共用的路径在首次 COW 时一次性结算。 |
-| **`deleteSnapshot(s)`** | 对该快照登记过的每个 `frozen_fid`：**-1** | 来自 `cow_inodes` 或快照元数据索引；**仅当减到 0** 时 `free_inode(frozen_fid)` + 释放 BlockMap |
+| **`deleteSnapshot(s)`** | 对该快照登记过的每个 `frozen_fid`：**-1** | 来自 `cow_inodes`、`dir_diffs.deleted` 或快照元数据索引；**仅当减到 0** 时 `free_inode(frozen_fid)` + 释放 BlockMap |
 | **活动路径修改 live inode** | 不增减 | live 与快照引用解耦 |
 
 ```text
@@ -595,8 +628,13 @@ cow_split(file_fid, parent, name):
 
 deleteSnapshot(snap_id):
   for fid in snap.cow_inodes:
-    if (--InodeTable[fid].snap_ref_count == 0)
+    if snapshot_ref_index.remove(fid, snap_id) and --InodeTable[fid].snap_ref_count == 0
       free_inode_and_blockmap(fid)
+  for diff in snap.dir_diffs:
+    for fid in diff.deleted.values:
+      if snapshot_ref_index.remove(fid, snap_id) and --InodeTable[fid].snap_ref_count == 0
+        free_inode_and_blockmap(fid)
+  release snap.dir_diffs
   remove SnapshotRecord
   append EditLog(SNAPSHOT_DELETE, snap_id, ...)
 ```
@@ -619,9 +657,11 @@ deleteSnapshot(snap_id):
 
 ##### fsck（§8.5 扩展）
 
-- 对每个 `snap_ref_count > 0` 的 inode：存在至少一条 `SnapshotRecord` / `cow_inodes` 反向引用。
+- 对每个 `snap_ref_count > 0` 的 inode：存在至少一条 `SnapshotRecord` / `cow_inodes` / `dir_diffs.deleted` / `SnapshotRefIndex` 反向引用。
 - 对每个 `SnapshotRecord.cow_inodes` 中的 `fid`：`snap_ref_count >= 1`。
-- 删除快照后的 spot check：`cow_inodes` 中不应出现已 free 的 `fid`。
+- 对每个 `DirSnapshotDiff.deleted` 中的 `frozen_fid`：inode 存在，`snap_ref_count >= 1`，且 `SnapshotRefIndex[frozen_fid]` 包含该 `snap_id`。
+- `snap_ref_count == |SnapshotRefIndex[fid]|`；允许再与所有 `cow_inodes`、`dir_diffs.deleted` 的并集交叉校验。
+- 删除快照后的 spot check：`cow_inodes` / `dir_diffs.deleted` 中不应出现已 free 的 `fid`。
 
 ##### `count_snapshots_covering` 与倒排索引（已决）
 
@@ -631,27 +671,27 @@ deleteSnapshot(snap_id):
 
 | 方案 | 做法 |
 |------|------|
-| **A. 倒排索引（推荐）** | 维护 `SnapshotRefIndex: frozen_fid → { snap_id... }`（及可选 `(parent,name) → frozen_fid`）。`createSnapshot`：对仍与 live 共用的路径 **不** 预遍历；**COW 时** 将 `frozen_fid` 登记到 **当前所有覆盖该 `(parent,name)` 的活跃快照**（由 snap 链/目录 Ref 解析一次，写入索引）。`deleteSnapshot`：对 `cow_inodes` 中每个 `fid` 从索引移除 `snap_id`，再 `--snap_ref_count`。 |
+| **A. 倒排索引（推荐）** | 维护 `SnapshotRefIndex: frozen_fid → { snap_id... }`（及可选 `(parent,name) → frozen_fid`）。`createSnapshot`：对仍与 live 共用的路径 **不** 预遍历；**COW / 目录 diff 产生 frozen 引用时** 将 `frozen_fid` 登记到 **当前所有覆盖该 `(parent,name)` 的活跃快照**（由 snap 链/目录 Ref 解析一次，写入索引）。`deleteSnapshot`：对 `cow_inodes` 与 `dir_diffs.deleted` 中每个 `fid` 从索引移除 `snap_id`，再 `--snap_ref_count`。 |
 | **B. 快照创建时预计算** | 在 `createSnapshot` O(1) 元数据之外，记录「该快照可见的 (parent,name)→fid 视图版本」；首次 COW 时用 **快照差分元数据** 得到 `K`，写入 `snap_ref_count` 与 `cow_inodes`。 |
 
 - **禁止**：`deleteSnapshot` 或 replay 时依赖 **未持久化的** 临时扫描结果且与主路径不一致。
 - **再建快照 `s3`（file1 已 frozen）**：`SnapshotRefIndex` 对 `frozen_fid` **insert(s3)** 并 `snap_ref_count++`（与上表「再建快照」行一致）。
-- **fsck**：`snap_ref_count == |SnapshotRefIndex[fid]|`（允许索引与 `cow_inodes` 并集交叉校验）。
+- **fsck**：`snap_ref_count == |SnapshotRefIndex[fid]|`（允许索引与 `cow_inodes`、`dir_diffs.deleted` 并集交叉校验）。
 
-**成熟度说明**：引用计数为业界成熟手段，但须在 **COW 初值 / 多快照叠加 / delete + replay / 索引一致性** 上做 **专项测试**，列入 **P3.1 验收**。
+**成熟度说明**：引用计数为业界成熟手段，但须在 **COW 初值 / 目录 diff frozen 引用 / 多快照叠加 / delete + replay / 索引一致性** 上做 **专项测试**，列入 **P3.1 验收**。
 
-#### 6.3.4 与 §6.4 Checkpoint 的边界
+#### 6.3.5 与 §6.4 Checkpoint 的边界
 
 | | §6.3 用户快照 | §6.4 Checkpoint/FSImage |
 |--|----------------|-------------------------|
 | 目的 | 时间点恢复、误删回滚、对比历史 | MetaServer **重启/冷备**、缩短 replay |
 | 创建成本 | **O(1)** per snap | O(namespace) 后台扫描（可模糊） |
 | 读路径 | 快照视图 | 正常命名空间 |
-| 存储 | 内存 Ref + 被 COW 分离的 inode | 磁盘 FSImage 文件 |
+| 存储 | 内存 Ref + 目录 diff + 被 COW 分离的 inode | 磁盘 FSImage 文件 |
 
 两者可同时存在：HDFS 亦区分 **Snapshot** 与 **Checkpoint（FSImage）**。
 
-#### 6.3.5 未采纳为用户快照的方案
+#### 6.3.6 未采纳为用户快照的方案
 
 | 方案 | 结论 |
 |------|------|
@@ -671,18 +711,21 @@ deleteSnapshot(snap_id):
 triggerCheckpoint()   // 周期或 MetaCheckpoint RPC
   ├─ 记录 LAST_TXN_ID = committed_txn_id  （N）
   ├─ 后台线程遍历 InodeTable、DirTable（§4）、BlockMap（可选）
-  │     允许与写并发；图像可「模糊」
-  ├─ 写出 FSImage + footer(N)
+  │     允许与写并发，但只序列化 txn_id <= N 的 committed 视图
+  │     忽略 pending txn>N；对 delete_txn>N 的旧版本仍按 N 时刻保留
+  ├─ 写出 FSImage + footer(N)；每个 section 带 section checksum 和 max_txn_seen<=N
   └─ 原子 publish
 
 冷启动：load FSImage(N) → replay Edit Log (txn_id > N) → 一致
 ```
 
-正确性：依赖 §6.7 之「先 Edit Log 再内存 / committed 边界」；模糊项由 replay 修正（同原 §6.3.2 论证）。
+正确性：依赖 §6.7 的版本化可见性边界。Checkpoint 可以与写并发，但 **不能**把 txn>N 的新 dentry/inode 写入 FSImage(N)，否则冷启动 replay(txn>N) 会重复 create、复活已删除对象或双加计数。实现必须在扫描时按 `create_txn <= N < delete_txn` 过滤，或在 FSImage 记录中携带版本并在 load 阶段过滤。
 
 #### 6.4.2 FSImage 内容与 Large 目录
 
-- `section_inodes`、`section_dirs`（Small 桶或嵌入 **`kfstree` checkpoint 流**）。
+- `section_inodes`、`section_dirs`（Small 逻辑项或 Large `DirBTree` 记录流），记录必须带可过滤的 create/delete txn 或保证已经按 N 过滤。
+- `section_snapshots`：只写 `create_txn <= N` 且未在 N 前删除的 `SnapshotRecord`，包括 `cow_inodes`、`dir_diffs` 与可重建 `SnapshotRefIndex` 的记录；`DIR_SNAPSHOT_DIFF_UPDATE` 中 txn>N 的变化不得进入 FSImage(N)。
+- `section_blockmap`（可选）：若写入，则与 inode 一样按版本过滤，避免 replay 后重复块引用计数。
 - log 截断：**可选**运维操作，非恢复前提。
 
 #### 6.4.3 代价
@@ -714,11 +757,13 @@ triggerCheckpoint()   // 周期或 MetaCheckpoint RPC
 **提议默认顺序**：
 
 ```text
-（分片锁内）改内存 → append 到 log 内存 buffer → 释放锁
+reserve txn_id
+（分片锁内）写 pending 版本 → append 到 log 内存 buffer → 释放锁
 （log 线程）buffer → 复制 → fsync → 推进 committed_txn_id
+（发布阶段）txn_id <= committed_txn_id 的 pending 版本进入 committed 视图
 ```
 
-对比 QFS：**先 log committed 再 `handle()`**，客户端等待包含「空窗期」内无法从内存读到结果的双重延迟。本 RFC 的可见性边界见 **§6.7**：其他客户端以 **已提交命名空间** 为准；发起方在 RPC 成功后的可见范围与 **lease / sync 策略** 对齐 HDFS 习惯，而非「未提交 txn 全网可见」。
+对比 QFS：**先 log committed 再 `handle()`**，客户端等待包含「空窗期」内无法从内存读到结果的双重延迟。本 RFC 将内存修改拆成 **pending 版本** 与 **committed 视图**：写路径可以先构造 pending 状态并排队 fsync，但普通读路径只能读 committed 视图。发起方在 RPC 成功后的可见范围与 **lease / sync 策略** 对齐 HDFS 习惯，而非「未提交 txn 全网可见」。
 
 ### 6.7 读一致性（已决）
 
@@ -726,8 +771,8 @@ triggerCheckpoint()   // 周期或 MetaCheckpoint RPC
 
 | 场景 | 规则 |
 |------|------|
-| **命名空间变更**（CREATE / REMOVE / RENAME …） | 对其他客户端：仅在 edit **已 committed**（`committed_txn_id` 推进、quorum 复制完成）后可见；primary 内存中未 fsync 的 buffer **不**对外暴露。 |
-| **RPC 返回与 durable** | `sync=always`：成功返回 ≡ 命名空间变更已 durable，他客户端可见（在 primary 正常服务前提下）。`sync=batch`：返回表示 **已接受**；他客户端可见时点不早于本批 **组提交 fsync**（类比 HDFS edit 组提交窗口）。 |
+| **命名空间变更**（CREATE / REMOVE / RENAME …） | 对其他客户端：仅在 edit **已 committed**（`committed_txn_id` 推进、quorum 复制完成）后可见；primary 内存中的 pending 版本 **不**进入普通读视图。 |
+| **RPC 返回与 durable** | `sync=always`：成功返回 ≡ 命名空间变更已 durable，他客户端可见（在 primary 正常服务前提下）。`sync=batch`：返回表示 **已接受并分配 txn/fid**；他客户端可见时点不早于本批 **组提交 fsync**。若需要 read-your-writes，必须用会话 token 或等待 txn committed。 |
 | **文件数据读写** | 命名空间登记（create 得 fid）与 **写数据** 分离；已打开文件的读写一致性由 **chunk lease** 保证写者独占/租约续期，读者看到已提交块版本，与 HDFS 「NN 管名字、DN 管块 + lease」分工一致。 |
 | **Primary / standby** | 仅 primary 执行 namespace 变更并写 edit；standby 通过 log replay 追赶；客户端 mutating 与强一致命名空间读面向 primary（与现 VR 一致）。 |
 
@@ -736,7 +781,7 @@ triggerCheckpoint()   // 周期或 MetaCheckpoint RPC
 - **(a) 仅 primary 本地可见未提交变更**：不足以定义多客户端语义，且与 backup 复制模型冲突。
 - **(b) 未提交 txn 全网可见**：破坏恢复与 fsck 假设，并引入跨客户端脏读。
 
-**实现提示**：可在 `Inode` 或目录上保留 `last_committed_txn`；`lookup` / `readdir` 仅暴露 `txn_id ≤ committed_txn_id` 的视图；写路径 lease 逻辑复用现有 QFS 实现，本层不新增第二套租约协议。
+**实现提示**：DirEntry/Inode 需要携带 `create_txn`、`delete_txn`（或等价版本区间）与 pending 标志；`lookup` / `readdir` 只暴露 `create_txn <= committed_txn_id < delete_txn` 的视图。commit 发布可以批量翻转 pending，也可以只推进全局 `committed_txn_id` 并在读路径过滤。写路径 lease 逻辑复用现有 QFS 实现，本层不新增第二套租约协议。
 
 ---
 
@@ -750,14 +795,36 @@ triggerCheckpoint()   // 周期或 MetaCheckpoint RPC
 | `InodeTable` | `hash(fid) % M` 分片；读多写少用 RW lock |
 | `FidAllocator` | 无锁原子或独立 mutex |
 | `EditLog buffer` | 单写者 + MPSC 队列 |
-| `PathCache` | RCU 或 per-shard 锁 |
+| 客户端 `PathCache` | 客户端本地缓存，不在 MetaServer 锁层次内 |
 
 **禁止**：所有 mutating RPC 共用一个 `submit_request` 全局 mutex（现状瓶颈）。
 
 ### 7.2 与 B+ 树分片锁的区别
 
 对 **全局 `metatree`（单例 B+ 树）**，「按 parent 加锁」**不安全**（不同目录可能 split 同一内部节点，见 `MetaTree-Lock-Optimization.md`）。  
-对 **DirTable 分片**：按 `parent_fid` 加锁 **安全**——Small 为独立 `flat_hash`；Large 为 **该目录专属 `Tree` 实例**（仍用 `kfstree`，但不与别目录共享内部节点）。
+对 **DirTable 分片**：按 `parent_fid` 加锁 **安全**——Small 为独立 `flat_hash`；Large 为 **该目录专属 `DirBTree` 实例**（抽取/适配 `kfstree` 节点算法，但不与别目录共享内部节点）。
+
+### 7.3 跨目录操作锁顺序（已决）
+
+`rename`、dumpster move、快照 COW / 目录 diff 更新会同时触碰多个目录、inode、BlockMap 与快照元数据，必须使用全局确定性锁顺序，禁止按调用路径临时加锁。
+
+**锁顺序**：
+
+```text
+1. SnapshotRegistry / SnapshotRefIndex 元数据锁（仅快照相关操作）
+2. DirShard locks，按 (shard_id, parent_fid) 升序；同一目录只加一次
+3. InodeTable locks，按 fid 升序
+4. BlockMap locks，按 fid 升序
+5. EditLog append buffer（只追加内存 buffer，不在锁内等待 fsync）
+```
+
+规则：
+
+- `RENAME(src_parent, name, dst_parent, new_name)`：先按 `(shard_id, parent_fid)` 顺序拿源/目标父目录写锁；在锁内重新校验源项存在、目标项冲突、权限和 generation；再写入同一个 txn 的 pending `delete(src)` + `create(dst)`，并更新 inode parent/name 与目录 diff。
+- `remove(..., todumpster=true)`：视为从源父目录 rename 到 `dumpster_fid`，按同一 DirShard 顺序加锁，不给 dumpster 单独开后门锁。
+- 快照 COW / diff：先拿 snapshot 元数据锁，确定受影响的 `snap_id` / `DirSnapshotDiff` / `SnapshotRefIndex`，再按目录和 fid 顺序加锁；不得持有低层锁后再回头等待 snapshot 元数据锁。
+- 冲突处理：多资源操作使用 `try_lock` + 释放已持有锁 + 退避重试，避免 ABBA；禁止在持有另一把目录锁时做读锁升级为写锁。
+- `EditLog` 只在已完成内存 pending 版本后 append buffer；`fsync` / quorum 等待发生在释放业务锁之后。
 
 ---
 
@@ -823,7 +890,7 @@ Phase A — InodeTable
 Phase B — DirTable（每个目录 fid）
   按 DirNode.layout 枚举：
     - **SMALL**：flat_hash 全桶扫描，校验无重复 NameKey、探测链有界
-    - **LARGE**：遍历该目录专属 `Tree` 叶（同现 `kfstree` 迭代），校验 `Key(KFS_DENTRY, parent, hash)` 与 name 唯一
+    - **LARGE**：遍历该目录专属 `DirBTree` 叶（同现 `kfstree` 迭代），校验 `Key(KFS_DENTRY, parent, hash)` 与 name 唯一
   对每条 DirEntry (name → child_fid)：
     - InodeTable[child_fid] 存在且 parent_fid == 当前目录 fid
   对 InodeTable 中 type=dir 的项：
@@ -839,10 +906,12 @@ Phase C — 双向一致
 Phase D — 与 edit committed 视图一致（可选在线 fsck）
   仅扫描 txn_id ≤ committed_txn_id 的视图（§6.7）
 
-Phase E — 用户快照引用计数（§6.3.6）
+Phase E — 用户快照与目录 diff（§6.3.1.1 / §6.3.4）
   - 对每个 SnapshotRecord：cow_inodes 中 fid 存在且 snap_ref_count >= 1
-  - 对每个 snap_ref_count > 0 的 inode：至少被一个 SnapshotRecord.cow_inodes 引用
-  - 无 snap_ref_count == 0 且仅被快照元数据悬挂的 unreachable frozen
+  - 对每个 DirSnapshotDiff：所属 snap_id 存在，base_txn <= committed_txn_id，created/deleted 的 NameKey 无重复
+  - 对每个 DirSnapshotDiff.deleted 中的 frozen_fid：inode 存在，snap_ref_count >= 1，SnapshotRefIndex 包含该 snap_id
+  - 对每个 snap_ref_count > 0 的 inode：至少被 SnapshotRecord.cow_inodes、dir_diffs.deleted 或 SnapshotRefIndex 引用
+  - snap_ref_count == |SnapshotRefIndex[fid]|；无 snap_ref_count == 0 且仅被快照元数据悬挂的 unreachable frozen
 ```
 
 报告格式可继续兼容现 `MetaFsck` / `kfsfsck` 客户端字段；内部扫描源从 `metatree` 迭代改为 **InodeTable + DirIndex 枚举**。
@@ -858,7 +927,7 @@ Phase E — 用户快照引用计数（§6.3.6）
 | **P2** | `DirTable`（§4.2 Small+Large+promotion）+ `InodeTable`；百万级单目录基准 | 高 |
 | **P2.1** | §8.4 特殊路径 + §8.5 fsck（含两种 DirNode layout） | 可运维 |
 | **P3** | v2 edit + §6.4 Checkpoint（FSImage N + replay）+ 冷启动闭环 | 很高 |
-| **P3.1** | §6.3 用户快照 + §6.3.6 `snap_ref_count`（COW/delete/replay/fsck 测试） | 可回滚目录 |
+| **P3.1** | §6.3 用户快照 + 目录 Diff + §6.3.4 `snap_ref_count`（COW/delete/replay/fsck 测试） | 可回滚目录 |
 
 （**范围外**：多 MetaServer namespace 分片、BlockMap 独立服务、inode 换出等，不列入本 RFC 路线图。）
 
@@ -879,8 +948,11 @@ Phase E — 用户快照引用计数（§6.3.6）
 | Promotion 原子性与可见性 | §4.2.6 |
 | Promotion 墙钟上限（写者饥饿） | §4.2.6 |
 | Readdir cookie 逻辑位置（`NameKey`） | §5.4.1 |
-| 用户快照（HDFS 式 Ref + 文件级 COW） | §6.3 |
-| Frozen inode `snap_ref_count` + 倒排索引 | §6.3.6 |
+| 用户快照（HDFS 式 Ref + 文件级 COW + 目录 Diff） | §6.3 |
+| Frozen inode `snap_ref_count` + 倒排索引 | §6.3.4 |
+| 目录快照 Diff | §6.3.1.1 |
+| Pending / committed 视图 | §6.6 / §6.7 |
+| 跨目录锁顺序 | §7.3 |
 | Checkpoint/FSImage（一致性点 + 后台遍历） | §6.4 |
 | Checkpoint 扫描内存与节流 | §6.4.4 |
 
@@ -895,7 +967,7 @@ Phase E — 用户快照引用计数（§6.3.6）
 | 保留全局 B+ 树，仅优化锁 | 无法消除双 insert 与树分裂；并发上限低（见 `MetaTree-Lock-Optimization.md`） |
 | 仅全局 B+ 树 | 已否决；见 §4.2.5 |
 | 单目录百万项仍用平铺 HashMap+链表 | **已否决**；首版必须 Large 布局 + promotion |
-| 每目录一棵 B+ 树（Large 布局） | **已采纳**，**复用 `kfstree`**，仅用于 `child_count ≥ threshold` 的目录 |
+| 每目录一棵 B+ 树（Large 布局） | **已采纳**，**抽取/适配 `kfstree` 节点算法**，仅用于 `child_count ≥ threshold` 的目录 |
 | 自研另一套目录 B-tree 实现 | **已否决**，与现网重复且难保持 checkpoint 一致 |
 | 纯 tmpfs、无持久化 | 不符合 QFS 定位 |
 | 完全照搬 RocksDB/LSM 存 namespace | 写放大与 create 延迟不如 hash + edit log 直接 |
@@ -920,7 +992,7 @@ Phase E — 用户快照引用计数（§6.3.6）
 
 ## 13. 参考文献（仓库内）
 
-- `src/cc/meta/kfstree.h` / `kfstree.cc` — B+ 树（Large 目录 **复用** 本实现；全局 `metatree` 不再用于 namespace dentry）
+- `src/cc/meta/kfstree.h` / `kfstree.cc` — B+ 树（Large 目录 **抽取/适配** 节点算法；全局 `metatree` 不再用于 namespace dentry）
 - `src/cc/meta/kfsops.cc` — `Tree::create` / `link` 双 `insert`
 - `src/cc/meta/MetaRequest.cc` — `MetaCreate::start` / `handle`，`SubmitBegin`
 - `src/cc/meta/LogWriter.cc` — `Enqueue`、`WriteLog`、`fsync`
@@ -943,10 +1015,11 @@ Phase E — 用户快照引用计数（§6.3.6）
 | 0.5 | 2026-05-25 | 范围限定单机内存；去掉 namespace 水平分片/换出开放项与 P4 路线图 |
 | 0.6 | 2026-05-25 | §8.4 特殊路径、§8.5 fsck 已决；§10 无剩余开放项 |
 | 0.7 | 2026-05-25 | §4.2 大目录首版必做：Small flat_hash + Large 每目录 B+ 树 + promotion |
-| 0.8 | 2026-05-25 | Large 布局明确复用现 `kfstree`（`Tree`/`Node`/`Key`/`MetaDentry`），不新写 B-tree |
+| 0.8 | 2026-05-25 | Large 布局明确抽取/适配现 `kfstree` 节点算法（`Node`/`Key`/`MetaDentry`），不新写 B-tree |
 | 0.9 | 2026-05-25 | §6.3 已决：一致性点 + 后台模糊 FSImage + replay(txn>N) |
 | 1.0 | 2026-05-25 | §6.3 改为 HDFS 式用户快照（InodeRef+文件级COW）；§6.4 为 Checkpoint/FSImage |
 | 1.1 | 2026-05-25 | §4.2.6 Promotion：`PROMOTING` 状态、staging、读 Small/写等待、原子发布 |
-| 1.2 | 2026-05-25 | §6.3.6：`snap_ref_count`、COW/删快照维护、fsck 与无环不变量 |
+| 1.2 | 2026-05-25 | §6.3.4：`snap_ref_count`、COW/删快照维护、fsck 与无环不变量 |
 | 1.3 | 2026-05-25 | §5.4.1：readdir cookie 用逻辑 key 游标，禁止 LeafIter/节点指针 |
-| 1.4 | 2026-05-25 | §4.2.6 晋升墙钟上限；§5.4.1 `last_key`；§6.3.6 倒排索引；§6.4.4 checkpoint 内存 |
+| 1.4 | 2026-05-25 | §4.2.6 晋升墙钟上限；§5.4.1 `last_key`；§6.3.4 倒排索引；§6.4.4 checkpoint 内存 |
+| 1.5 | 2026-05-25 | 补充 pending/committed 视图、目录快照 diff、checkpoint 版本过滤、跨目录锁顺序 |

From a4dfe4205247582af58fd337673055377736b674 Mon Sep 17 00:00:00 2001
From: zhangzhibiao <zhangzhibiao@bigo.sg>
Date: Mon, 1 Jun 2026 16:26:01 +0800
Subject: [PATCH 7/7] Optimize QFS metadata and write path

---
 benchmarks/mstress/mstress_client.cc          |  140 +-
 clean_start_cluster.sh                        |   49 +
 docs/rfc/RFC-0002-hdfs-like-write-flow.md     |  161 +
 ...FC-0003-write-path-optimization-summary.md |  217 ++
 press.sh                                      |   11 +
 src/cc/chunk/ChunkManager.cc                  |  120 +-
 src/cc/chunk/ChunkManager.h                   |    6 +
 src/cc/chunk/ClientSM.cc                      |    7 +-
 src/cc/chunk/KfsOps.cc                        |  145 +-
 src/cc/chunk/KfsOps.h                         |   48 +-
 src/cc/chunk/LeaseClerk.cc                    |    2 +-
 src/cc/common/BufferedLogWriter.cc            |    2 +-
 src/cc/kfsio/IOBuffer.cc                      |   23 +
 src/cc/kfsio/IOBuffer.h                       |    4 +
 src/cc/libclient/KfsClient.cc                 |  262 +-
 src/cc/libclient/KfsClientInt.h               |   25 +-
 src/cc/libclient/KfsOps.cc                    |   29 +-
 src/cc/libclient/KfsOps.h                     |   24 +-
 src/cc/libclient/KfsProtocolWorker.cc         |    8 +-
 src/cc/libclient/KfsProtocolWorker.h          |    7 +-
 src/cc/libclient/WriteAppender.cc             |    2 +
 src/cc/libclient/Writer.cc                    |  736 +++-
 src/cc/libclient/Writer.h                     |   64 +-
 src/cc/meta/CMakeLists.txt                    |   19 +-
 src/cc/meta/Checkpoint.cc                     |    4 +
 src/cc/meta/ChunkServer.cc                    |   32 +-
 src/cc/meta/ChunkServer.h                     |    1 +
 src/cc/meta/ClientManager.h                   |    4 +
 src/cc/meta/ClientSM.h                        |    2 +
 src/cc/meta/LayoutManager.cc                  |  126 +-
 src/cc/meta/LayoutManager.h                   |    5 +
 src/cc/meta/LogWriter.cc                      |  337 +-
 src/cc/meta/MetaRequest.cc                    |  630 +++-
 src/cc/meta/MetaRequest.h                     |   50 +-
 src/cc/meta/NamespaceV2.cc                    | 2969 +++++++++++++++++
 src/cc/meta/NamespaceV2.h                     |  457 +++
 src/cc/meta/NetDispatch.cc                    |  105 +-
 src/cc/meta/NetDispatch.h                     |    3 +
 src/cc/meta/Replay.cc                         |  189 ++
 src/cc/meta/Replay.h                          |    1 +
 src/cc/meta/Restorer.cc                       |  161 +
 src/cc/meta/namespacev2bench_main.cc          |  367 ++
 src/cc/meta/namespacev2test_main.cc           |  886 +++++
 src/cc/meta/namespacev2walreplaytest_main.cc  |  169 +
 src/cc/qcdio/QCThread.cc                      |    7 +-
 src/cc/tools/qfsput_main.cc                   |    9 +-
 46 files changed, 8464 insertions(+), 161 deletions(-)
 create mode 100755 clean_start_cluster.sh
 create mode 100644 docs/rfc/RFC-0002-hdfs-like-write-flow.md
 create mode 100644 docs/rfc/RFC-0003-write-path-optimization-summary.md
 create mode 100755 press.sh
 create mode 100644 src/cc/meta/NamespaceV2.cc
 create mode 100644 src/cc/meta/NamespaceV2.h
 create mode 100644 src/cc/meta/namespacev2bench_main.cc
 create mode 100644 src/cc/meta/namespacev2test_main.cc
 create mode 100644 src/cc/meta/namespacev2walreplaytest_main.cc

diff --git a/benchmarks/mstress/mstress_client.cc b/benchmarks/mstress/mstress_client.cc
index 96a7c1620..87d69ab3f 100644
--- a/benchmarks/mstress/mstress_client.cc
+++ b/benchmarks/mstress/mstress_client.cc
@@ -37,6 +37,8 @@
 #include <vector>
 #include <algorithm>
 #include <deque>
+#include <stdint.h>
+#include <fcntl.h>
 
 #if __cplusplus >= 201103L
 #include <random>
@@ -45,12 +47,40 @@
 using namespace std;
 
 #include "libclient/KfsClient.h"
+#include "common/Properties.h"
 
 FILE* logFile = stdout;
 
 #define TEST_BASE_DIR "/mstress"
 #define COUNT_INCR 500
 
+struct WriteTimingStats {  // Accumulated timing of the file Open/Write/Close calls.
+  int64_t openUsec;   // total microseconds spent in timed Open() calls
+  int64_t writeUsec;  // total microseconds spent in timed Write() calls
+  int64_t closeUsec;  // total microseconds spent in timed Close() calls
+  int64_t openCount;  // number of timed Open() calls
+  int64_t writeCount; // number of timed Write() calls
+  int64_t closeCount; // number of timed Close() calls
+
+  WriteTimingStats()  // every total and counter starts at zero
+    : openUsec(0),
+      writeUsec(0),
+      closeUsec(0),
+      openCount(0),
+      writeCount(0),
+      closeCount(0)
+    {}
+};
+
+static WriteTimingStats gWriteTimingStats;
+
+static int64_t NowUsec()  // Returns the current gettimeofday() time in microseconds.
+{
+  struct timeval tv;
+  gettimeofday(&tv, NULL);
+  return (int64_t)tv.tv_sec * 1000000 + (int64_t)tv.tv_usec;  // cast first so the multiply is done in 64 bits
+}
+
 /*
   This program is invoked with the following arguments:
     - qfs server/port
@@ -139,6 +169,7 @@ struct Client {
   int levels_;
   int inodesPerLevel_;
   int pathsToStat_;
+  int64_t fileSize_;
 };
 const size_t Client::INITIAL_SIZE = 1 << 12;
 
@@ -148,7 +179,17 @@ class AutoCleanupKfsClient
 public:
   AutoCleanupKfsClient(Client* client) : initialized(false)
   {
-    kfsClient = KFS::Connect(client->dfsServer_, client->dfsPort_);
+    const char* const config = getenv("QFS_CLIENT_CONFIG");
+    if (config && config[0]) {
+      KFS::Properties props;
+      if (props.loadProperties(config, '=') == 0) {
+        kfsClient = KFS::Connect(client->dfsServer_, client->dfsPort_, &props);
+      } else {
+        kfsClient = 0;
+      }
+    } else {
+      kfsClient = KFS::Connect(client->dfsServer_, client->dfsPort_);
+    }
     if (kfsClient) {
       initialized = true;
     }
@@ -190,6 +231,23 @@ void myitoa(int n, char* buf, size_t len = 32)
   snprintf(buf, len, "%d", n);
 }
 
+static void DumpQfsClientStats(KFS::KfsClient* kfsClient, const char* tag)  // Prints the client's stats to logFile.
+{
+  if (! kfsClient) {
+    return;  // no client, nothing to report
+  }
+  KFS::Properties* const stats = kfsClient->GetStats();
+  if (! stats) {
+    return;  // GetStats() gave us nothing to print
+  }
+  fprintf(logFile, "\n=== qfs_client stats (%s) ===\n", (tag ? tag : ""));  // a NULL tag prints as an empty label
+  for (KFS::Properties::iterator it = stats->begin(); it != stats->end(); ++it) {
+    fprintf(logFile, "%s=%s\n", it->first.c_str(), it->second.c_str());  // one key=value line per entry
+  }
+  fprintf(logFile, "=== end qfs_client stats ===\n\n");
+  delete stats;  // the object returned by GetStats() is released here
+}
+
 //Return a random permutation of numbers in [0..range).
 void unique_random(vector<size_t>& result, size_t range)
 {
@@ -292,6 +350,7 @@ void ParsePlanFile(Client* client)
 {
   string line;
   ifstream ifs(client->planfilePath_.c_str(), ifstream::in);
+  client->fileSize_ = 0;
 
   while (ifs.good()) {
     getline(ifs, line);
@@ -314,6 +373,10 @@ void ParsePlanFile(Client* client)
       client->pathsToStat_ = atoi(line.substr(6).c_str());
       continue;
     }
+    if (line.substr(0, 9) == "filesize=") {
+      client->fileSize_ = atoll(line.substr(9).c_str());
+      continue;
+    }
   }
   ifs.close();
   if (client->levels_ <= 0 || client->inodesPerLevel_ <= 0 || client->type_.empty()) {
@@ -355,6 +418,7 @@ int CreateDFSPaths(Client* client, AutoCleanupKfsClient* kfs, int level, int* cr
       (*createdCount)++;
       if (*createdCount > 0 && (*createdCount) % COUNT_INCR == 0) {
         fprintf(logFile, "Created paths so far: %d\n", *createdCount);
+        fflush(logFile);
       }
       if (!isLeaf) {
         rc = CreateDFSPaths(client, kfs, level+1, createdCount);
@@ -365,14 +429,56 @@ int CreateDFSPaths(Client* client, AutoCleanupKfsClient* kfs, int level, int* cr
       }
     } else {
       //fprintf(logFile, "Creating file [%s]\n", client->path_.actualPath_);
-      rc = kfsClient->Create(client->path_.String());
+      const int64_t openStartUsec = NowUsec();
+      rc = client->fileSize_ > 0 ?
+        kfsClient->Open(client->path_.String(), O_CREAT|O_RDWR) :
+        kfsClient->Create(client->path_.String());
+      if (client->fileSize_ > 0) {
+        gWriteTimingStats.openUsec += NowUsec() - openStartUsec;
+        gWriteTimingStats.openCount++;
+      }
       if (rc < 0) {
         fprintf(logFile, "Create(%s) failed with rc=%d\n", client->path_.String(), rc);
         return rc;
       }
+      if (client->fileSize_ > 0) {
+        static const size_t kWriteBufSize = 1 << 20;
+        if (*createdCount == 0) {
+          fprintf(logFile, "Writing %lld bytes per file...\n",
+            (long long)client->fileSize_);
+          fflush(logFile);
+        }
+        static vector<char> sWriteBuf(kWriteBufSize, 'x');
+        int64_t remaining = client->fileSize_;
+        while (remaining > 0) {
+          const size_t len = (remaining < (int64_t)kWriteBufSize) ?
+            (size_t)remaining : kWriteBufSize;
+          const int64_t writeStartUsec = NowUsec();
+          const ssize_t wr = kfsClient->Write(rc, &sWriteBuf[0], len);
+          gWriteTimingStats.writeUsec += NowUsec() - writeStartUsec;
+          gWriteTimingStats.writeCount++;
+          if (wr != (ssize_t)len) {
+            fprintf(logFile, "Write(%s) failed expected=%zu actual=%ld\n",
+              client->path_.String(), len, (long)wr);
+            kfsClient->Close(rc);
+            return (wr < 0 ? (int)wr : -EIO);
+          }
+          remaining -= wr;
+        }
+        const int64_t closeStartUsec = NowUsec();
+        const int closeErr = kfsClient->Close(rc);
+        gWriteTimingStats.closeUsec += NowUsec() - closeStartUsec;
+        gWriteTimingStats.closeCount++;
+        if (closeErr < 0) {
+          fprintf(logFile, "Close(%s) failed with rc=%d\n",
+            client->path_.String(), closeErr);
+          return closeErr;
+        }
+      }
       (*createdCount)++;
       if (*createdCount > 0 && (*createdCount) % COUNT_INCR == 0) {
         fprintf(logFile, "Created paths so far: %d\n", *createdCount);
+        fflush(logFile);
       }
     }
     client->path_.Pop(name);
@@ -407,6 +513,32 @@ int CreateDFSPaths(Client* client, AutoCleanupKfsClient* kfs)
   struct timeval tvZigma;
   gettimeofday(&tvZigma, NULL);
   fprintf(logFile, "Client: %d paths created in %ld msec\n", createdCount, TimeDiffMilliSec(&tvAlpha, &tvZigma));
+  fflush(logFile);
+  if (client->fileSize_ > 0) {
+    const long totalMsec = TimeDiffMilliSec(&tvAlpha, &tvZigma);
+    fprintf(logFile, "Client: %lld bytes written in %ld msec\n",
+      (long long)createdCount * (long long)client->fileSize_,
+      totalMsec);
+    fprintf(logFile,
+      "Client write timing: open count=%lld total=%lld usec avg=%lld usec\n",
+      (long long)gWriteTimingStats.openCount,
+      (long long)gWriteTimingStats.openUsec,
+      (long long)(gWriteTimingStats.openCount ?
+        gWriteTimingStats.openUsec / gWriteTimingStats.openCount : 0));
+    fprintf(logFile,
+      "Client write timing: write count=%lld total=%lld usec avg=%lld usec\n",
+      (long long)gWriteTimingStats.writeCount,
+      (long long)gWriteTimingStats.writeUsec,
+      (long long)(gWriteTimingStats.writeCount ?
+        gWriteTimingStats.writeUsec / gWriteTimingStats.writeCount : 0));
+    fprintf(logFile,
+      "Client write timing: close count=%lld total=%lld usec avg=%lld usec\n",
+      (long long)gWriteTimingStats.closeCount,
+      (long long)gWriteTimingStats.closeUsec,
+      (long long)(gWriteTimingStats.closeCount ?
+        gWriteTimingStats.closeUsec / gWriteTimingStats.closeCount : 0));
+    fflush(logFile);
+  }
   return 0;
 }
 
@@ -446,6 +578,7 @@ int StatDFSPaths(Client* client, AutoCleanupKfsClient* kfs) {
 
     if (count > 0 && count % COUNT_INCR == 0) {
       fprintf(logFile, "Stat paths so far: %d\n", count);
+      fflush(logFile);
     }
   }
 
@@ -496,6 +629,7 @@ int ListDFSPaths(Client* client, AutoCleanupKfsClient* kfs) {
       children.pop_back();
       if (inodeCount > 0 && inodeCount % COUNT_INCR == 0) {
         fprintf(logFile, "Readdir paths so far: %d\n", inodeCount);
+        fflush(logFile);
       }
     }
   }
@@ -588,6 +722,7 @@ int RemoveDFSPaths(Client* client, AutoCleanupKfsClient* kfs) {
 int main(int argc, char* argv[])
 {
   Client client;
+  setvbuf(logFile, NULL, _IOLBF, 0);
 
   parse_options(argc, argv, &client);
 
@@ -612,6 +747,7 @@ int main(int argc, char* argv[])
     fprintf(logFile, "Error: unrecognized test '%s'", client.testName_.c_str());
     return -1;
   }
+  DumpQfsClientStats(kfs.GetClient(), client.testName_.c_str());
   return result;
 }
 
diff --git a/clean_start_cluster.sh b/clean_start_cluster.sh
new file mode 100755
index 000000000..228b86ce7
--- /dev/null
+++ b/clean_start_cluster.sh
@@ -0,0 +1,49 @@
+ pkill -f 'mstress_100k_1m_file'
+  pkill -f 'mstress_client'
+  pkill -f 'metaserver'
+  pkill -f 'chunkserver'
+
+# clean cluster
+
+  cd /work/bigo-qfs
+  TS=$(date +%Y%m%d_%H%M%S)
+
+  rm -f qfsbase/meta/metaserver.pid \
+        qfsbase/chunk1/chunkserver.pid \
+        qfsbase/chunk2/chunkserver.pid \
+        qfsbase/chunk3/chunkserver.pid
+
+  mv qfsbase/meta/logs qfsbase/meta/logs.bak.$TS 2>/dev/null || true
+  mv qfsbase/meta/checkpoints qfsbase/meta/checkpoints.bak.$TS 2>/dev/null || true
+  mkdir -p qfsbase/meta/logs qfsbase/meta/checkpoints
+
+  for p in \
+    qfsbase/chunk1/chunkdir11 \
+    qfsbase/chunk1/chunkdir12 \
+    qfsbase/chunk2/chunkdir21 \
+    qfsbase/chunk3/chunkdir31
+  do
+    mv "$p" "$p.bak.$TS" 2>/dev/null || true
+    mkdir -p "$p"
+  done
+
+# start meta
+  cd /work/bigo-qfs
+
+  bld/output/bin/metaserver -c qfsbase/meta/conf/MetaServer.prp qfsbase/meta/MetaServer.log
+
+  setsid -f bld/output/bin/metaserver \
+    qfsbase/meta/conf/MetaServer.prp \
+    qfsbase/meta/MetaServer.log \
+    >> qfsbase/meta/MetaServer.out 2>&1
+
+# start chunk
+ cd /work/bigo-qfs
+
+  setsid -f bld/output/bin/chunkserver qfsbase/chunk1/conf/ChunkServer.prp qfsbase/chunk1/ChunkServer.log > qfsbase/chunk1/ChunkServer.out 2>&1
+  setsid -f bld/output/bin/chunkserver qfsbase/chunk2/conf/ChunkServer.prp qfsbase/chunk2/ChunkServer.log > qfsbase/chunk2/ChunkServer.out 2>&1
+  setsid -f bld/output/bin/chunkserver qfsbase/chunk3/conf/ChunkServer.prp qfsbase/chunk3/ChunkServer.log > qfsbase/chunk3/ChunkServer.out 2>&1
+
+sleep 6
+
+bld/output/bin/tools/qfsping -m -s 202.168.115.34 -p 20000
diff --git a/docs/rfc/RFC-0002-hdfs-like-write-flow.md b/docs/rfc/RFC-0002-hdfs-like-write-flow.md
new file mode 100644
index 000000000..41be84aaa
--- /dev/null
+++ b/docs/rfc/RFC-0002-hdfs-like-write-flow.md
@@ -0,0 +1,161 @@
+# RFC-0002: HDFS-like Write Flow for QFS
+
+## Summary
+
+This plan introduces an optional HDFS-like write allocation path for QFS.
+The goal is to reduce small-file write latency by removing the synchronous
+metaserver-to-chunkserver pre-create step from the normal replicated write
+allocation path.
+
+The existing QFS write flow is preserved by default. The new path is enabled
+only when both metaserver and chunkserver switches are turned on.
+
+## Current QFS Write Flow
+
+For a normal replicated write, the current QFS path is:
+
+1. Client creates or opens a file and obtains a file id.
+2. Client write enters `Writer`.
+3. `Writer` sends `ALLOCATE` to metaserver when it needs a chunk.
+4. Metaserver selects chunkservers, creates metadata, grants a write lease.
+5. Metaserver sends `ALLOCATE_CHUNK` to chunkserver before replying to client.
+6. `ALLOCATE_CHUNK` is logged as an in-flight metaserver-to-chunkserver op.
+7. Chunkserver creates the local chunk and registers the lease.
+8. Client receives allocation result.
+9. Client sends `WRITE_ID_ALLOC` to chunkserver.
+10. Client sends `WRITE_PREPARE` / `WRITE_SYNC` data RPCs.
+11. Client close / sync waits for pending writes and sends `CLOSE_CHUNK`.
+
+The hot part for small-file writes is steps 5-7. In benchmark investigation,
+the pre-RPC in-flight logging before `ALLOCATE_CHUNK` dominated allocation
+latency.
+
+## Target HDFS-like Compatible Flow
+
+With the new optional path:
+
+1. Client creates or opens a file and obtains a file id.
+2. Client write enters `Writer`.
+3. `Writer` sends `ALLOCATE` to metaserver when it needs a chunk.
+4. Metaserver selects chunkservers, creates metadata, grants a write lease.
+5. Metaserver replies to client without sending `ALLOCATE_CHUNK`.
+6. Client sends `WRITE_ID_ALLOC` with `File-handle` and `Lease-id`.
+7. Chunkserver lazily creates the chunk if it does not exist.
+8. Chunkserver registers the write lease locally.
+9. Client sends `WRITE_PREPARE` / `WRITE_SYNC` data RPCs.
+10. Client close / sync waits for pending writes and sends `CLOSE_CHUNK`.
+
+This keeps the existing QFS client write protocol mostly intact while moving
+chunk creation from metaserver-driven pre-create to client-write-driven lazy
+creation.
+
+## Config Switches
+
+Both switches must be enabled:
+
+```properties
+metaServer.writeFlow.hdfsLikeAllocate = 1
+chunkServer.writeFlow.lazyCreateOnWrite = 1
+```
+
+Default behavior remains the original QFS path:
+
+```properties
+metaServer.writeFlow.hdfsLikeAllocate = 0
+chunkServer.writeFlow.lazyCreateOnWrite = 0
+```
+
+## Implemented Changes
+
+Implemented in this branch:
+
+- Metaserver:
+  - Added `metaServer.writeFlow.hdfsLikeAllocate`.
+  - For normal replicated non-append, non-striped, non-object-store allocation,
+    metaserver can skip chunkserver pre-create.
+  - The allocation request is completed immediately through `LayoutDone()`.
+  - `MetaAllocate` response now includes `Lease-id` when available.
+
+- Client:
+  - `AllocateOp` now parses `Lease-id`.
+  - `WRITE_ID_ALLOC` now carries optional `File-handle` and `Lease-id`.
+  - `Writer` passes file id and lease id to `WRITE_ID_ALLOC`.
+  - `WriteAppender` also passes these fields, but append remains excluded from
+    the HDFS-like metaserver bypass path.
+
+- Chunkserver:
+  - Added `chunkServer.writeFlow.lazyCreateOnWrite`.
+  - `WRITE_ID_ALLOC` parses optional `File-handle` and `Lease-id`.
+  - If lazy-create is enabled and a normal write targets a missing chunk,
+    chunkserver creates the chunk in `AllocateWriteId()`.
+  - The write lease is registered locally from the lease id carried by the
+    client before the normal lease validation.
+
+## Not Fully Done
+
+This is not yet a complete production-grade HDFS write-flow replacement.
+
+- Client-to-chunkserver auth and synchronous replication token semantics are
+  not fully reworked for lazy creation.
+- Append, object-store, and striped files intentionally remain on the original
+  QFS allocation path.
+- Crash-recovery behavior has not yet been validated with kill/restart tests.
+- The current implementation still includes prior timing instrumentation in
+  the write path; decide later whether to keep or clean it up.
+- The nested `MetaLogChunkAllocate` path now explicitly schedules a log flush
+  after enqueue; without this, HDFS-like allocate skipped chunkserver wait but
+  still waited for the metaserver log writer timeout cadence.
+
+## Test Plan
+
+Minimum validation before treating this as stable:
+
+1. Build:
+   - `cmake --build bld --target metaserver chunkserver qfsput mstress_client -j8`
+2. Unit test:
+   - `./bld/output/bin/devtools/namespacev2test`
+3. Functional write test:
+   - Enable both switches.
+   - Clean cluster and restart metaserver/chunkservers.
+   - Write a 1MB file with `qfsput`.
+   - Verify `qfs -ls` reports the expected size.
+   - Read the file back and verify byte count/content.
+4. Benchmark:
+   - Run small-file write benchmark with old path.
+   - Run the same benchmark with HDFS-like path enabled.
+   - Compare total time, `Write.AllocateUsec`, close latency, and chunkserver
+     lazy-create failures.
+5. Failure tests:
+   - Kill client after metaserver allocation but before write.
+   - Kill chunkserver after lazy create but before close.
+   - Restart and verify `HELLO` / `AVAILABLE_CHUNK` convergence.
+
+## Current Verification Status
+
+Completed:
+
+- Build passed for `metaserver`, `chunkserver`, `qfsput`, and `mstress_client`.
+- `namespacev2test` passed.
+- `git diff --check` passed.
+- Clean-cluster functional 1MB `qfsput` passed: `qfs -ls` reported 1048576 bytes.
+- HDFS-like lazy-create path was verified in chunkserver logs.
+- 50 x 1MB write probe after fixing nested log flush:
+  - Total time: 366 ms.
+  - Previous HDFS-like run before the flush fix: 50454 ms.
+  - `Write.AllocateUsec`: 7587 usec total for 49 allocations, down from about 49048728 usec.
+  - Client close average: 6668 usec, down from about 1008198 usec.
+- 1000 x 1MB write probe after the flush fix:
+  - Total time: 18265 ms.
+  - `Write.AllocateUsec`: 159286 usec total for 999 allocations.
+  - `Write.ChunkWriteUsec`: 16222757 usec total; the remaining dominant cost is chunk write/close, not metaserver allocate.
+
+Crash/restart validation:
+
+- Completed-file full restart passed: wrote `/recovery/ok_8m`, restarted metaserver and all chunkservers without cleaning logs or chunk dirs, then `qfscat` readback matched the original sha256.
+- Interrupted writer restart exposed a correctness gap: killing a large `qfsput` left `/recovery/killed_stream` with a namespace size beyond a chunk that was lazy-created and written but not made stable. After restart, chunkservers deleted that dirty chunk as stale, while metaserver still had the chunk mapping; reading failed at offset 536870912 with `no replicas available chunk: 131084`.
+- A simple attempt to create lazy chunks as initially stable was rejected by the existing write path (`WRITE_ID_ALLOC` returned `chunk stable`), so this needs a real recovery design rather than a shortcut.
+
+Pending:
+
+- Add proper client-crash recovery semantics for HDFS-like lazy-created chunks. Candidate fixes: lease recovery that makes the last dirty chunk stable, or metaserver-side truncation/mapping cleanup for chunks that never become stable.
+- Full 100k-file benchmark if needed; the short and medium probes already verify the 1s allocate stall is fixed.
diff --git a/docs/rfc/RFC-0003-write-path-optimization-summary.md b/docs/rfc/RFC-0003-write-path-optimization-summary.md
new file mode 100644
index 000000000..ac7232476
--- /dev/null
+++ b/docs/rfc/RFC-0003-write-path-optimization-summary.md
@@ -0,0 +1,217 @@
+# RFC-0003: QFS Write Path Optimization Summary
+
+## Summary
+
+This document summarizes the write-path optimization work on branch
+`lock-opt` and the proposed plan for preparing upstream pull requests.
+
+The overall direction is to reduce small-file create/write latency by:
+
+- reducing metaserver namespace lock contention,
+- removing synchronous chunkserver pre-create from the hot allocation path,
+- reusing chunkserver connections,
+- replacing chain replication in the client write path with client-side fanout,
+- avoiding duplicate checksum scans on every chunkserver replica,
+- reducing avoidable buffer copies in fanout.
+
+The current benchmark focus is replicated 1 MB file creation with three
+chunkservers.
+
+## Implemented Optimizations
+
+### Namespace / Metaserver
+
+- Added the NamespaceV2 implementation and tests.
+- Reworked metadata operations toward finer-grained locking instead of a single
+  coarse global namespace lock.
+- Added write transaction / WAL work needed by the v2 namespace path.
+- Added batch apply / commit optimizations for high-frequency small
+  transactions.
+- Added recovery validation tests for NamespaceV2 WAL replay.
+
+### HDFS-like Write Allocation
+
+- Added an optional HDFS-like allocation path:
+  - `metaServer.writeFlow.hdfsLikeAllocate`
+  - `chunkServer.writeFlow.lazyCreateOnWrite`
+- The metaserver can allocate chunk metadata and return the lease to the client
+  without synchronously sending `ALLOCATE_CHUNK` to chunkservers.
+- The chunkserver lazily creates the chunk on `WRITE_ID_ALLOC` when enabled.
+- The client passes file id and lease id to `WRITE_ID_ALLOC`.
+- Crash/restart validation exposed an incomplete-write gap (killed writers);
+  the proposed recovery direction is to truncate EOF to the last recoverable
+  chunk instead of trusting a namespace size past available stable replicas.
+
+### Client Chunkserver Connection Reuse
+
+- Added a chunkserver client pool for the write path.
+- `mstress_client` now honors `QFS_CLIENT_CONFIG`, so benchmark runs can use the
+  same client config as normal tools.
+- This removed connection churn from short-file write tests:
+  - `ChunkServer.Pool.Connect=3`
+  - `ChunkServer.Pool.OpsQueued=9000` for 1000 files (1000 files x 3
+    replicas x 3 fanout RPC stages).
+
+### Parallel Replica Write Fanout
+
+- Added `client.parallelReplicaWrite` and enabled it by default in the test
+  configuration.
+- Added `No-forward` / `NF` support for:
+  - `WRITE_ID_ALLOC`
+  - `WRITE_PREPARE`
+  - `WRITE_SYNC`
+  - `CLOSE`
+- Client now sends write-id allocation, write prepare, and close RPCs directly
+  to all replicas instead of relying on chunkserver-to-chunkserver forwarding.
+- The request still carries the full replica list so each chunkserver can derive
+  its own replica position and write id.
+
+### Fanout Buffer Sharing
+
+- Verified that `IOBufferData` already uses a ref-counted data block.
+- Added `IOBuffer::AppendShared()` so fanout requests attach shared buffer
+  references directly.
+- `Writer` now uses `AppendShared()` instead of creating a temporary cloned
+  `IOBuffer` for each replica fanout request.
+- Payload bytes are not copied for fanout; each replica request holds a shared
+  reference and the data is released when the last reference drops.
+
+### Checksum Hot Path
+
+- Client now sends the per-block (64 KB) checksum vector in `WRITE_PREPARE` reply mode.
+- Chunkserver can reuse the client-provided checksum vector for chunk metadata.
+- With `chunkServer.skipWritePrepareChecksumVerify=1`, chunkserver skips the
+  duplicate payload scan in the write hot path and trusts the client-provided
+  checksum vector.
+- Short RPC checksum vector output was fixed to preserve hex formatting for
+  subsequent short-format fields.
+
+This matches the HDFS-style tradeoff more closely: clients provide packet /
+chunk checksums, datanodes store them, and later reads or scrubs verify stored
+data against those checksums.
+
+## Latest Benchmark Snapshot
+
+Environment:
+
+- three local chunkservers,
+- client and chunkservers use `202.168.115.34` instead of `localhost` to force
+  traffic through the network path,
+- three replicas,
+- 1 MB files,
+- `client.parallelReplicaWrite=1`,
+- `chunkServer.skipWritePrepareChecksumVerify=1`.
+
+### Single Client
+
+Plan: 1 client, 1000 files, 1 MB per file.
+
+```text
+1000 files created in 3058 ms
+
+open  avg: 240 us
+write avg: 134 us
+close avg: 2681 us
+
+Write.ChunkWriteUsec: 1735099 us
+Write.CloseUsec:      2655539 us
+Write.WriteIdAlloc:   225061 us
+Write.ChunkClose:     114009 us
+
+ChunkServer.Pool.BytesSent: 3147979791
+ChunkServer.Pool.Connect:   3
+ChunkServer.Pool.OpsQueued: 9000
+```
+
+Approximate throughput:
+
+- logical write throughput: about 327 MB/s,
+- actual client network send: about 1.03 GB/s because every 1 MB file is sent
+  to three replicas.
+
+### Two Clients
+
+Plan: 2 clients, 1000 files per client, 1 MB per file.
+
+Before checksum-vector / skip-verify optimization:
+
+```text
+proc_00: 4353 ms, Write.ChunkWriteUsec=2865775
+proc_01: 4349 ms, Write.ChunkWriteUsec=2863192
+```
+
+After checksum-vector / skip-verify optimization:
+
+```text
+proc_00: 4171 ms, Write.ChunkWriteUsec=2414194
+proc_01: 4209 ms, Write.ChunkWriteUsec=2435554
+```
+
+`ChunkWriteUsec` dropped by about 14-16%. Total time dropped by about 3-4%.
+The remaining cost is dominated by three-replica network fanout and chunk file
+write / close work.
+
+## Correctness Notes
+
+- `chunkServer.skipWritePrepareChecksumVerify=1` changes write-time checksum
+  semantics: the chunkserver trusts the client-provided checksum vector instead
+  of recomputing checksums over the received payload. This is closer to the
+  HDFS write-path tradeoff, but it should be treated as a deliberate
+  configuration choice.
+- The HDFS-like lazy-create path needs careful recovery semantics for killed
+  clients. The current direction is to truncate or repair namespace EOF to the
+  last recoverable stable chunk after restart.
+- Append, striped files, object store files, and authenticated / tokenized
+  synchronous replication paths need separate review before enabling the new
+  write flow broadly.
+
+## Upstream PR Plan
+
+The current branch contains several related but separable changes. For upstream
+review, split into smaller PRs:
+
+1. **Infrastructure / tests**
+   - NamespaceV2 tests and WAL replay tests.
+   - Benchmark client config loading through `QFS_CLIENT_CONFIG`.
+   - Minimal scripts or docs only if acceptable upstream.
+
+2. **NamespaceV2 / lock optimization**
+   - Finer-grained metadata locking.
+   - WAL / transaction correctness tests.
+   - Keep performance changes separate from protocol changes where possible.
+
+3. **HDFS-like lazy create**
+   - Config switches.
+   - Metaserver allocate bypass.
+   - Chunkserver lazy chunk creation on `WRITE_ID_ALLOC`.
+   - Recovery behavior must be completed before this is proposed as
+     production-ready.
+
+4. **Write connection reuse and fanout**
+   - Client chunkserver pool.
+   - `client.parallelReplicaWrite`.
+   - `No-forward` protocol support.
+   - Parallel `WRITE_ID_ALLOC`, `WRITE_PREPARE`, and `CLOSE`.
+
+5. **Checksum-vector hot-path optimization**
+   - Client sends block checksum vector in write-prepare reply mode.
+   - Chunkserver reuses the vector.
+   - Optional `chunkServer.skipWritePrepareChecksumVerify`.
+
+6. **Buffer sharing cleanup**
+   - `IOBuffer::AppendShared()`.
+   - Writer fanout uses shared buffer references instead of temporary clone
+     buffers.
+
+## Remaining Work
+
+- Add chunkserver-side detailed timing counters for:
+  - request parse,
+  - checksum handling,
+  - disk queue submit,
+  - disk completion latency.
+- Finish crash/restart recovery for killed writers under lazy create.
+- Re-run larger 100k-file tests after recovery semantics are finalized.
+- Run compatibility tests with short RPC disabled and enabled.
+- Run tests with `chunkServer.skipWritePrepareChecksumVerify=0` and `1` to make
+  the correctness/performance tradeoff explicit.
diff --git a/press.sh b/press.sh
new file mode 100755
index 000000000..32b2064ae
--- /dev/null
+++ b/press.sh
@@ -0,0 +1,11 @@
+  cd /work/bigo-qfs
+
+  python bld/benchmarks/mstress/mstress.py \
+    -m slave \
+    -f qfs \
+    -s localhost \
+    -p 20000 \
+    -t create \
+    -a output/mstress_100k_1m_file.plan \
+    -c localhost \
+    -k localhost
diff --git a/src/cc/chunk/ChunkManager.cc b/src/cc/chunk/ChunkManager.cc
index 9348ab8fe..146759f13 100644
--- a/src/cc/chunk/ChunkManager.cc
+++ b/src/cc/chunk/ChunkManager.cc
@@ -102,7 +102,7 @@ struct ChunkManager::ChunkDirInfo : public ITimeout
     ChunkDirInfo()
         : ITimeout(),
           dirname(),
-          bufferedIoFlag(false),
+          bufferedIoFlag(true),
           storageTier(kKfsSTierUndef),
           usedSpace(0),
           availableSpace(-1),
@@ -1792,7 +1792,7 @@ ChunkInfoHandle::Release(ChunkInfoHandle::ChunkLists* chunkInfoLists)
     if (! IsStable()) {
         UpdateDirStableCount();
     }
-    KFS_LOG_STREAM_INFO <<
+    KFS_LOG_STREAM_DEBUG <<
         "closing chunk " << chunkInfo.chunkId <<
         " version: "     << chunkInfo.chunkVersion <<
         " file handle: " << logFH <<
@@ -2126,7 +2126,7 @@ ChunkManager::ChunkManager()
       mMinPendingIoThreshold(8 << 20),
       mPlacementMaxWaitingAvgUsecsThreshold(5 * 60 * 1000 * 1000),
       mAllowSparseChunksFlag(true),
-      mBufferedIoFlag(false),
+      mBufferedIoFlag(true),
       mSyncChunkHeaderFlag(false),
       mCheckDirWritableFlag(true),
       mCheckDirTestWriteSize(16 << 10),
@@ -2156,6 +2156,8 @@ ChunkManager::ChunkManager()
       mDiskBufferManagerEnabledFlag(true),
       mForceVerifyDiskReadChecksumFlag(false),
       mWritePrepareReplyFlag(true),
+      mSkipWritePrepareChecksumVerifyFlag(false),
+      mLazyCreateOnWriteFlag(false),
       mCryptoKeys(globalNetManager(), 0),
       mFileSystemId(-1),
       mFileSystemIdSuffix(),
@@ -2577,6 +2579,12 @@ ChunkManager::SetParameters(const Properties& prop)
     mWritePrepareReplyFlag = prop.getValue(
         "chunkServer.debugTestWriteSync",
         mWritePrepareReplyFlag ? 0 : 1) == 0;
+    mSkipWritePrepareChecksumVerifyFlag = prop.getValue(
+        "chunkServer.skipWritePrepareChecksumVerify",
+        mSkipWritePrepareChecksumVerifyFlag ? 1 : 0) != 0;
+    mLazyCreateOnWriteFlag = prop.getValue(
+        "chunkServer.writeFlow.lazyCreateOnWrite",
+        mLazyCreateOnWriteFlag ? 1 : 0) != 0;
     mFsIdFileNamePrefix = prop.getValue(
         "chunkServer.fsIdFileNamePrefix", mFsIdFileNamePrefix);
     mDirCheckerIoTimeoutSec = prop.getValue(
@@ -2806,7 +2814,8 @@ ChunkManager::SetBufferedIo(const Properties& props)
                 break;
             }
         }
-        const bool bufferedIoFlag = pit != prefixes.end();
+        const bool bufferedIoFlag =
+            mBufferedIoFlag || (pit != prefixes.end());
         if (bufferedIoFlag != it->bufferedIoFlag) {
             it->bufferedIoFlag = bufferedIoFlag;
             if (it->availableSpace < 0 && ! it->dirLock) {
@@ -3176,7 +3185,7 @@ ChunkManager::AllocChunk(
         cih->Delete(mChunkInfoLists);
         return -EFAULT;
     }
-    KFS_LOG_STREAM_INFO << "creating chunk: " << MakeChunkPathname(cih) <<
+    KFS_LOG_STREAM_DEBUG << "creating chunk: " << MakeChunkPathname(cih) <<
     KFS_LOG_EOM;
     int ret = OpenChunk(cih, O_RDWR | O_CREAT);
     if (ret < 0) {
@@ -3320,8 +3329,17 @@ ChunkManager::MakeChunkStable(kfsChunkId_t chunkId, kfsSeq_t chunkVersion,
             return -EINVAL;
         }
     } else if (chunkVersion != cih->chunkInfo.chunkVersion) {
-        statusMsg = "version mismatch";
-        return -EINVAL;
+        if (! (mLazyCreateOnWriteFlag && ! appendFlag && ! cih->IsStable() &&
+                cih->chunkInfo.chunkVersion == 0 && chunkVersion > 0)) {
+            statusMsg = "version mismatch";
+            return -EINVAL;
+        }
+        KFS_LOG_STREAM_INFO <<
+            "make stable lazy dirty chunk version remap:"
+            " chunk: " << chunkId <<
+            " local: " << cih->chunkInfo.chunkVersion <<
+            " target: " << chunkVersion <<
+        KFS_LOG_EOM;
     }
     if (cih->IsBeingReplicated()) {
         statusMsg = "chunk replication is in progress";
@@ -3598,9 +3616,21 @@ ChunkManager::ReadChunkMetadataDone(ReadChunkMetaOp* op, IOBuffer* dataBuf)
                     " " << op->Show() <<
                 KFS_LOG_EOM;
             } else {
+                const int64_t lazyDirtyRecoveredSize =
+                    (mLazyCreateOnWriteFlag && ! cih->IsStable() &&
+                        cih->chunkInfo.chunkVersion == 0 &&
+                        dci.chunkSize == 0 && cih->chunkInfo.chunkSize > 0) ?
+                    cih->chunkInfo.chunkSize : int64_t(-1);
                 cih->chunkInfo.SetChecksums(dci);
                 cih->chunkInfo.chunkFlags = dci.flags;
-                if (cih->chunkInfo.chunkSize > (int64_t)dci.chunkSize) {
+                if (0 <= lazyDirtyRecoveredSize) {
+                    KFS_LOG_STREAM_INFO <<
+                        "using dirty lazy-created chunk file size:"
+                        " chunk: " << cih->chunkInfo.chunkId <<
+                        " size: "  << lazyDirtyRecoveredSize <<
+                    KFS_LOG_EOM;
+                    cih->chunkInfo.chunkSize = lazyDirtyRecoveredSize;
+                } else if (cih->chunkInfo.chunkSize > (int64_t)dci.chunkSize) {
                     const int64_t extra =
                         cih->chunkInfo.chunkSize - dci.chunkSize;
                     if (0 <= cih->chunkInfo.chunkVersion) {
@@ -4442,7 +4472,7 @@ ChunkManager::OpenChunk(ChunkInfoHandle* cih, int openFlags)
         cih->UpdateDirStableCount();
     }
     KFS_LOG_STREAM(openFlag ?
-            MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelINFO) <<
+            MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelDEBUG) <<
         (openFlag ? "open" : "create") <<
         " chunk file: "  << fn <<
         " file handle: " << reinterpret_cast<const void*>(cih->dataFH.get()) <<
@@ -4540,7 +4570,7 @@ ChunkManager::CloseChunk(ChunkInfoHandle* cih, KfsOp* op /* = 0 */)
             ! cih->SyncMeta()) {
         Release(*cih);
     } else {
-        KFS_LOG_STREAM_INFO <<
+        KFS_LOG_STREAM_DEBUG <<
             "chunk: " << cih->chunkInfo.chunkId <<
             " version: " << cih->chunkInfo.chunkVersion <<
             " not released on close; might give up lease" <<
@@ -5659,13 +5689,37 @@ ChunkManager::RemoveDirtyChunks()
                     fileSystemId,
                     ioTimeSec,
                     readFlag)) {
-                const bool kStableFlag      = false;
-                const bool kForceDeleteFlag = true;
-                ScheduleCleanup(
-                    *it, fileId, chunkId, chunkVers,
-                    (int64_t)buf.st_size - (int64_t)KFS_CHUNK_HEADER_SIZE,
-                    kStableFlag, kForceDeleteFlag);
-                InsertLastInFlight(chunkId);
+                const int64_t dataSize =
+                    (int64_t)buf.st_size - (int64_t)KFS_CHUNK_HEADER_SIZE;
+                if (mLazyCreateOnWriteFlag) {
+                    const bool kStableFlag = false;
+                    ChunkInfoHandle* const cih =
+                        new ChunkInfoHandle(*it, kStableFlag);
+                    cih->chunkInfo.fileId       = fileId;
+                    cih->chunkInfo.chunkId      = chunkId;
+                    cih->chunkInfo.chunkVersion = chunkVers;
+                    cih->chunkInfo.chunkSize    = max(int64_t(0), dataSize);
+                    if (AddMapping(cih) == cih) {
+                        KFS_LOG_STREAM_INFO <<
+                            "preserving dirty lazy-created chunk:"
+                            " file: "    << fileId <<
+                            " chunk: "   << chunkId <<
+                            " version: " << chunkVers <<
+                            " size: "    << cih->chunkInfo.chunkSize <<
+                        KFS_LOG_EOM;
+                    } else {
+                        const bool kForceDeleteFlag = true;
+                        const bool kEvacuatedFlag   = false;
+                        MakeStale(*cih, kForceDeleteFlag, kEvacuatedFlag);
+                    }
+                } else {
+                    const bool kStableFlag      = false;
+                    const bool kForceDeleteFlag = true;
+                    ScheduleCleanup(
+                        *it, fileId, chunkId, chunkVers, dataSize,
+                        kStableFlag, kForceDeleteFlag);
+                    InsertLastInFlight(chunkId);
+                }
             } else {
                 KFS_LOG_STREAM_INFO <<
                     "cleaning out dirty chunk: " << name <<
@@ -6340,9 +6394,37 @@ ChunkManager::AllocateWriteId(
     const ServerLocation& peerLoc)
 {
     const bool kAddObjectBlockMappingFlag = false;
-    ChunkInfoHandle* const cih = GetChunkInfoHandle(
+    ChunkInfoHandle* cih = GetChunkInfoHandle(
         wi->chunkId, wi->chunkVersion, kAddObjectBlockMappingFlag);
-    if (! cih) {
+    if (! cih && mLazyCreateOnWriteFlag && ! wi->isForRecordAppend &&
+            0 <= wi->fileId && 0 <= wi->chunkVersion) {
+        const bool kIsBeingReplicatedFlag = false;
+        const bool kMustExistFlag = false;
+        const int ret = AllocChunk(
+            wi->fileId,
+            wi->chunkId,
+            wi->chunkVersion,
+            kKfsSTierUndef,
+            kKfsSTierUndef,
+            kIsBeingReplicatedFlag,
+            &cih,
+            kMustExistFlag
+        );
+        if (ret < 0) {
+            wi->statusMsg = "lazy chunk create failed";
+            wi->status = ret;
+        } else {
+            wi->lazyChunkCreatedFlag = true;
+            KFS_LOG_STREAM_DEBUG <<
+                "lazy chunk create:"
+                " file: "    << wi->fileId <<
+                " chunk: "   << wi->chunkId <<
+                " version: " << wi->chunkVersion <<
+            KFS_LOG_EOM;
+        }
+    }
+    if (0 != wi->status) {
+    } else if (! cih) {
         wi->statusMsg = "no such chunk";
         wi->status = -EBADF;
     } else if (wi->chunkVersion != cih->chunkInfo.chunkVersion) {
diff --git a/src/cc/chunk/ChunkManager.h b/src/cc/chunk/ChunkManager.h
index 7bfdaaebd..f55faf3c4 100644
--- a/src/cc/chunk/ChunkManager.h
+++ b/src/cc/chunk/ChunkManager.h
@@ -120,6 +120,10 @@ class ChunkManager : private ITimeout {
     };
 
     bool SetParameters(const Properties& prop);
+    bool IsLazyCreateOnWriteEnabled() const
+        { return mLazyCreateOnWriteFlag; }
+    bool IsWritePrepareChecksumVerifySkipped() const
+        { return mSkipWritePrepareChecksumVerifyFlag; }
     /// Init function to configure the chunk manager object.
     bool Init(const vector<string>& chunkDirs, const Properties& prop);
 
@@ -962,6 +966,8 @@ class ChunkManager : private ITimeout {
     bool       mDiskBufferManagerEnabledFlag;
     bool       mForceVerifyDiskReadChecksumFlag;
     bool       mWritePrepareReplyFlag;
+    bool       mSkipWritePrepareChecksumVerifyFlag;
+    bool       mLazyCreateOnWriteFlag;
     CryptoKeys mCryptoKeys;
     int64_t    mFileSystemId;
     string     mFileSystemIdSuffix;
diff --git a/src/cc/chunk/ClientSM.cc b/src/cc/chunk/ClientSM.cc
index 6f9e0bc9c..c8469fea0 100644
--- a/src/cc/chunk/ClientSM.cc
+++ b/src/cc/chunk/ClientSM.cc
@@ -753,7 +753,8 @@ ClientSM::GetWriteOp(KfsOp& op, int align, int numBytes,
     }
     if (nAvail < numBytes) {
         mNetConnection->SetMaxReadAhead(numBytes - nAvail);
-        SetReceiveContent(numBytes, op.op == CMD_WRITE_PREPARE);
+        SetReceiveContent(numBytes, op.op == CMD_WRITE_PREPARE &&
+            ! gChunkManager.IsWritePrepareChecksumVerifySkipped());
         // we couldn't process the command...so, wait
         return false;
     }
@@ -944,7 +945,9 @@ ClientSM::HandleClientCmd(IOBuffer& iobuf, int inCmdLen)
         bufferBytes = 0 <= op->status ? IoRequestBytes(wop->numBytes) : 0;
         if (GetReceiveByteCount() == (int)wop->numBytes) {
             wop->receivedChecksum = GetChecksum();
-            wop->blocksChecksums.swap(GetBlockChecksums());
+            if (! gChunkManager.IsWritePrepareChecksumVerifySkipped()) {
+                wop->blocksChecksums.swap(GetBlockChecksums());
+            }
         }
         ReceiveClear();
     } else if (op->op == CMD_RECORD_APPEND) {
diff --git a/src/cc/chunk/KfsOps.cc b/src/cc/chunk/KfsOps.cc
index 78079707a..f9ebcc275 100644
--- a/src/cc/chunk/KfsOps.cc
+++ b/src/cc/chunk/KfsOps.cc
@@ -862,6 +862,33 @@ KfsOp::GetClientSM()
     return (clientSMFlag ? static_cast<ClientSM*>(clnt) : 0);
 }
 
+bool
+WritePrepareOp::Validate() // fills blocksChecksums from checksumsVal, then base Validate()
+{
+    if (checksumsCnt <= 0) { // nothing to parse
+        checksumsVal.clear();
+        return ChunkAccessRequestOp::Validate();
+    }
+    const char*       ptr = checksumsVal.mPtr;
+    const char* const end = ptr + checksumsVal.mLen;
+    blocksChecksums.clear();
+    blocksChecksums.reserve(checksumsCnt);
+    for (int i = 0; i < checksumsCnt; i++) { // exactly checksumsCnt entries are read
+        uint32_t cksum = 0;
+        if (! (initialShortRpcFormatFlag ? // hex when the flag is set, decimal otherwise
+                ValueParserT<HexIntParser>::ParseInt(ptr, end - ptr, cksum) :
+                ValueParserT<DecIntParser>::ParseInt(ptr, end - ptr, cksum))) {
+            return false; // parse failure: blocksChecksums is left partially filled
+        }
+        blocksChecksums.push_back(cksum);
+        while (ptr < end && (*ptr & 0xFF) > ' ') { // skip bytes above ' ' before next entry
+            ++ptr;
+        }
+    }
+    checksumsVal.clear(); // drop the raw text once all entries are parsed
+    return ChunkAccessRequestOp::Validate();
+}
+
 bool
 WriteSyncOp::Validate()
 {
@@ -1187,7 +1214,7 @@ WriteOp::HandleWriteDone(int code, void* data)
 void
 CloseOp::Execute()
 {
-    KFS_LOG_STREAM_INFO <<
+    KFS_LOG_STREAM_DEBUG <<
         "closing"
         " chunk: "   << chunkId <<
         " version: " << chunkVersion <<
@@ -1199,6 +1226,7 @@ CloseOp::Execute()
     int64_t        writeId       = -1;
     bool           needToForward = needToForwardToPeer(shortRpcFormatFlag,
         servers, numServers, myPos, peerLoc, hasWriteId, writeId);
+    needToForward = ! noForwardFlag && needToForward;
     if (chunkVersion < 0 && needToForward && hasWriteId) {
         status    = -EINVAL;
         statusMsg = "invalid object store file block close";
@@ -1253,8 +1281,12 @@ CloseOp::Execute()
                         waitReadableFlag ? &readMetaFlag : 0
                 );
                 if (ret < 0) {
-                    status    = ret;
-                    statusMsg = "invalid write or chunk id";
+                    if (! needAck && ret == -EBADF) {
+                        status = 0;
+                    } else {
+                        status    = ret;
+                        statusMsg = "invalid write or chunk id";
+                    }
                 }
                 if (waitReadableFlag && 0 <= ret) {
                     return;
@@ -1329,6 +1361,7 @@ CloseOp::HandleDone(int code, void* data)
 void
 AllocChunkOp::Execute()
 {
+    debugStartUsec = microseconds();
     int            myPos   = -1;
     int64_t        writeId = -1;
     ServerLocation peerLoc;
@@ -1427,6 +1460,7 @@ AllocChunkOp::HandleChunkAllocDone(int code, void* data)
         }
         if (! diskIo) {
             SET_HANDLER(this, &AllocChunkOp::HandleChunkAllocDone);
+            debugBeforeAllocUsec = microseconds();
             if (appendFlag) {
                 int            myPos   = -1;
                 int64_t        writeId = -1;
@@ -1449,9 +1483,11 @@ AllocChunkOp::HandleChunkAllocDone(int code, void* data)
                     this
                 );
             }
+            debugAfterAllocUsec = microseconds();
             if (diskIo) {
                 // File create is in progress. This method will be called again
                 // when create / open completes.
+                debugDiskWaitStartUsec = microseconds();
                 assert(status == 0);
                 return 0;
             }
@@ -1460,6 +1496,32 @@ AllocChunkOp::HandleChunkAllocDone(int code, void* data)
             gLeaseClerk.RegisterLease(*this);
         }
     }
+    const int64_t nowUsec = microseconds();
+    const int64_t totalUsec = debugStartUsec > 0 ?
+        nowUsec - debugStartUsec : 0;
+    if (100000 <= totalUsec) {
+        KFS_LOG_STREAM_INFO <<
+            "alloc-chunk timing:"
+            " seq: " << seq <<
+            " file: " << fileId <<
+            " chunk: " << chunkId <<
+            " version: " << chunkVersion <<
+            " status: " << status <<
+            " total-usec: " << totalUsec <<
+            " pre-alloc-usec: " <<
+                (debugBeforeAllocUsec > debugStartUsec ?
+                    debugBeforeAllocUsec - debugStartUsec : 0) <<
+            " alloc-call-usec: " <<
+                (debugAfterAllocUsec > debugBeforeAllocUsec ?
+                    debugAfterAllocUsec - debugBeforeAllocUsec : 0) <<
+            " disk-wait-usec: " <<
+                (debugDiskWaitStartUsec > 0 ?
+                    nowUsec - debugDiskWaitStartUsec : 0) <<
+            " post-alloc-usec: " <<
+                (debugAfterAllocUsec > 0 ?
+                    nowUsec - debugAfterAllocUsec : 0) <<
+        KFS_LOG_EOM;
+    }
     diskIo.reset();
     Submit();
     return 0;
@@ -2331,8 +2393,9 @@ WriteIdAllocOp::Execute()
     int64_t        dummyWriteId  = -1;
     int            myPos         = -1;
     ServerLocation peerLoc;
-    const bool     needToForward = needToForwardToPeer(shortRpcFormatFlag,
+    bool           needToForward = needToForwardToPeer(shortRpcFormatFlag,
         servers, numServers, myPos, peerLoc, false, dummyWriteId);
+    needToForward = ! noForwardFlag && needToForward;
     if (myPos < 0) {
         statusMsg = "invalid or missing Servers: field";
         status    = -EINVAL;
@@ -2342,6 +2405,19 @@ WriteIdAllocOp::Execute()
     const bool writeMaster          = myPos == 0;
     bool       allowCSClearTextFlag = chunkAccessTokenValidFlag &&
         (chunkAccessFlags & ChunkAccessToken::kAllowClearTextFlag) != 0;
+    if (writeMaster && gChunkManager.IsLazyCreateOnWriteEnabled() &&
+            0 <= leaseId && 0 <= fileId && ! gLeaseClerk.IsLeaseValid(
+            chunkId, chunkVersion, 0, 0)) {
+        AllocChunkOp leaseOp;
+        leaseOp.fileId               = fileId;
+        leaseOp.chunkId              = chunkId;
+        leaseOp.chunkVersion         = chunkVersion;
+        leaseOp.leaseId              = leaseId;
+        leaseOp.appendFlag           = false;
+        leaseOp.allowCSClearTextFlag = allowCSClearTextFlag;
+        leaseOp.shortRpcFormatFlag   = shortRpcFormatFlag;
+        gLeaseClerk.RegisterLease(leaseOp);
+    }
     if (writeMaster && ! gLeaseClerk.IsLeaseValid(
             chunkId, chunkVersion,
             &syncReplicationAccess, &allowCSClearTextFlag)) {
@@ -2375,6 +2451,8 @@ WriteIdAllocOp::Execute()
     }
     if (needToForward) {
         ForwardToPeer(peerLoc, writeMaster, allowCSClearTextFlag);
+    } else if (lazyChunkCreatedFlag) {
+        WriteLazyCreatedChunkMetadata();
     } else {
         ReadChunkMetadata();
     }
@@ -2427,10 +2505,29 @@ WriteIdAllocOp::HandlePeerReply(int code, void* data)
         initialShortRpcFormatFlag, peerShortRpcFormatFlag);
     writePrepareReplyFlag =
         writePrepareReplyFlag && fwdedOp->writePrepareReplyFlag;
+    if (lazyChunkCreatedFlag) {
+        return WriteLazyCreatedChunkMetadata();
+    }
     ReadChunkMetadata();
     return 0;
 }
 
+int
+WriteIdAllocOp::WriteLazyCreatedChunkMetadata() // issue chunk metadata write; op ends in Done()
+{
+    assert(status == 0); // callers reach this only on the success path
+    SET_HANDLER(this, &WriteIdAllocOp::Done);
+    const int ret = gChunkManager.WriteChunkMetadata(
+        chunkId, chunkVersion, this);
+    if (0 <= ret) { // presumably Done() is invoked when the write completes -- TODO confirm
+        return 0;
+    }
+    if (0 <= status) { // do not overwrite an error status that is already set
+        status = ret;
+    }
+    return Done(EVENT_CMD_DONE, this); // WriteChunkMetadata failed: finish the op now
+}
+
 void
 WriteIdAllocOp::ReadChunkMetadata()
 {
@@ -2474,7 +2571,7 @@ WriteIdAllocOp::Done(int code, void* data)
         }
     }
     KFS_LOG_STREAM(
-        status == 0 ? MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelERROR) <<
+        status == 0 ? MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) <<
         (status == 0 ? "done: " : "failed: ") << Show() <<
     KFS_LOG_EOM;
     Submit();
@@ -2489,8 +2586,9 @@ WritePrepareOp::Execute()
     // check if we need to forward anywhere
     ServerLocation peerLoc;
     int            myPos         = -1;
-    const bool     needToForward = needToForwardToPeer(shortRpcFormatFlag,
+    bool           needToForward = needToForwardToPeer(shortRpcFormatFlag,
         servers, numServers, myPos, peerLoc, true, writeId);
+    needToForward = ! noForwardFlag && needToForward;
     if (myPos < 0) {
         statusMsg = "invalid or missing Servers: field";
         status = -EINVAL;
@@ -2542,9 +2640,22 @@ WritePrepareOp::Execute()
     }
 
     if (blocksChecksums.empty()) {
-        blocksChecksums = ComputeChecksums(&dataBuf, numBytes, &receivedChecksum);
+        blocksChecksums = ComputeChecksums(
+            &dataBuf, numBytes, &receivedChecksum);
+    } else if (! gChunkManager.IsWritePrepareChecksumVerifySkipped()) {
+        receivedChecksum = ComputeBlockChecksum(&dataBuf, numBytes);
+    }
+    if (gChunkManager.IsWritePrepareChecksumVerifySkipped() &&
+            (offset % CHECKSUM_BLOCKSIZE != 0 ||
+            numBytes % CHECKSUM_BLOCKSIZE != 0 ||
+            blocksChecksums.size() != numBytes / CHECKSUM_BLOCKSIZE)) {
+        statusMsg = "invalid write checksum vector";
+        status = -EINVAL;
+        Done(EVENT_CMD_DONE, this);
+        return;
     }
-    if (receivedChecksum != checksum) {
+    if (! gChunkManager.IsWritePrepareChecksumVerifySkipped() &&
+            receivedChecksum != checksum) {
         statusMsg = "checksum mismatch";
         KFS_LOG_STREAM_ERROR <<
             "checksum mismatch: sent: " << checksum <<
@@ -2649,7 +2760,7 @@ WritePrepareOp::Done(int code, void* data)
         return 0;
     }
     KFS_LOG_STREAM(
-        status >= 0 ? MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelERROR) <<
+        status >= 0 ? MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) <<
         (status >= 0 ? "done: " : "failed: ") << Show() <<
         " status: " << status <<
         (statusMsg.empty() ? "" : " msg: ") << statusMsg <<
@@ -2669,8 +2780,9 @@ WriteSyncOp::Execute()
     ServerLocation peerLoc;
     int            myPos = -1;
     // check if we need to forward anywhere
-    const bool needToForward = needToForwardToPeer(shortRpcFormatFlag,
+    bool needToForward = needToForwardToPeer(shortRpcFormatFlag,
         servers, numServers, myPos, peerLoc, true, writeId);
+    needToForward = ! noForwardFlag && needToForward;
     if (myPos < 0) {
         statusMsg = "invalid or missing Servers: field";
         status = -EINVAL;
@@ -2890,7 +3002,7 @@ WriteSyncOp::Done(int code, void* data)
         return 0;
     }
     KFS_LOG_STREAM(
-        status >= 0 ? MsgLogger::kLogLevelINFO : MsgLogger::kLogLevelERROR) <<
+        status >= 0 ? MsgLogger::kLogLevelDEBUG : MsgLogger::kLogLevelERROR) <<
         (status >= 0 ? "done: " : "failed: ") << Show() <<
         " status: " << status <<
         (statusMsg.empty() ? "" : " msg: ") << statusMsg <<
@@ -3598,7 +3710,16 @@ WriteIdAllocOp::Request(ReqOstream& os)
         os << "Version: "       << KFS_VERSION_STR << "\r\n";
     }
     os <<
-    (shortRpcFormatFlag ? "H:" : "Chunk-handle: ")  << chunkId      << "\r\n" <<
+    (shortRpcFormatFlag ? "H:" : "Chunk-handle: ")  << chunkId      << "\r\n";
+    if (fileId >= 0) {
+        os << (shortRpcFormatFlag ? "P:" : "File-handle: ") <<
+            fileId << "\r\n";
+    }
+    if (leaseId >= 0) {
+        os << (shortRpcFormatFlag ? "L:" : "Lease-id: ") <<
+            leaseId << "\r\n";
+    }
+    os <<
     (shortRpcFormatFlag ? "V:" : "Chunk-version: ") << chunkVersion << "\r\n" <<
     (shortRpcFormatFlag ? "O:" : "Offset: ")        << offset       << "\r\n" <<
     (shortRpcFormatFlag ? "B:" : "Num-bytes: ")     << numBytes     << "\r\n" <<
diff --git a/src/cc/chunk/KfsOps.h b/src/cc/chunk/KfsOps.h
index 43e2d30a9..de21b5020 100644
--- a/src/cc/chunk/KfsOps.h
+++ b/src/cc/chunk/KfsOps.h
@@ -510,6 +510,10 @@ struct AllocChunkOp : public KfsOp {
     int                   chunkAccessLength;
     SyncReplicationAccess syncReplicationAccess;
     DiskIoPtr             diskIo;
+    int64_t               debugStartUsec;
+    int64_t               debugBeforeAllocUsec;
+    int64_t               debugAfterAllocUsec;
+    int64_t               debugDiskWaitStartUsec;
 
     AllocChunkOp()
         : KfsOp(CMD_ALLOC_CHUNK),
@@ -530,7 +534,11 @@ struct AllocChunkOp : public KfsOp {
           contentLength(0),
           chunkAccessLength(0),
           syncReplicationAccess(),
-          diskIo()
+          diskIo(),
+          debugStartUsec(0),
+          debugBeforeAllocUsec(0),
+          debugAfterAllocUsec(0),
+          debugDiskWaitStartUsec(0)
         {}
     void Execute();
     // handlers for reading/writing out the chunk meta-data
@@ -1012,6 +1020,7 @@ struct RetireOp : public KfsOp {
 struct CloseOp : public KfsClientChunkOp {
     uint32_t              numServers;      // input
     bool                  needAck;         // input: when set, this RPC is ack'ed
+    bool                  noForwardFlag;   // input: do not forward to peer
     bool                  hasWriteId;      // input
     int64_t               masterCommitted; // input
     StringBufT<256>       servers;         // input: set of servers on which to chunk is to be closed
@@ -1024,6 +1033,7 @@ struct CloseOp : public KfsClientChunkOp {
         : KfsClientChunkOp(CMD_CLOSE),
           numServers           (0u),
           needAck              (true),
+          noForwardFlag        (false),
           hasWriteId           (false),
           masterCommitted      ((int64_t)-1),
           servers              (),
@@ -1036,6 +1046,7 @@ struct CloseOp : public KfsClientChunkOp {
         : KfsClientChunkOp(CMD_CLOSE),
           numServers           (op.numServers),
           needAck              (op.needAck),
+          noForwardFlag        (op.noForwardFlag),
           hasWriteId           (op.hasWriteId),
           masterCommitted      (op.masterCommitted),
           servers              (op.servers),
@@ -1089,6 +1100,7 @@ struct CloseOp : public KfsClientChunkOp {
         .Def2("Num-servers",      "R",  &CloseOp::numServers)
         .Def2("Servers",          "S",  &CloseOp::servers)
         .Def2("Need-ack",         "A",  &CloseOp::needAck,         true)
+        .Def2("No-forward",       "NF", &CloseOp::noForwardFlag,  false)
         .Def2("Has-write-id",     "W",  &CloseOp::hasWriteId,      false)
         .Def2("Master-committed", "M",  &CloseOp::masterCommitted, int64_t(-1))
         .Def2("C-access-length",  "AL", &CloseOp::chunkAccessLength)
@@ -1238,6 +1250,8 @@ struct GetRecordAppendOpStatus : public KfsClientChunkOp
 };
 
 struct WriteIdAllocOp : public ChunkAccessRequestOp {
+    kfsFileId_t           fileId;
+    int64_t               leaseId;
     kfsSeq_t              clientSeq;         /* input */
     int64_t               offset;            /* input */
     size_t                numBytes;          /* input */
@@ -1246,15 +1260,19 @@ struct WriteIdAllocOp : public ChunkAccessRequestOp {
     StringBufT<256>       servers;           /* input: set of servers on which to write */
     WriteIdAllocOp*       fwdedOp;           /* if we did any fwd'ing, this is the op that tracks it */
     bool                  isForRecordAppend; /* set if the write-id-alloc is for a record append that will follow */
-    bool                  writePrepareReplyFlag; /* write prepare reply supported */
+    bool                  writePrepareReplyFlag;
+    bool                  noForwardFlag;
     bool                  peerShortRpcFormatFlag;
     int                   contentLength;
     int                   chunkAccessLength;
     SyncReplicationAccess syncReplicationAccess;
     RemoteSyncSMPtr       appendPeer;
+    bool                  lazyChunkCreatedFlag;
 
     WriteIdAllocOp()
         : ChunkAccessRequestOp(CMD_WRITE_ID_ALLOC),
+          fileId(-1),
+          leaseId(-1),
           clientSeq(-1),
           offset(0),
           numBytes(0),
@@ -1264,14 +1282,18 @@ struct WriteIdAllocOp : public ChunkAccessRequestOp {
           fwdedOp(0),
           isForRecordAppend(false),
           writePrepareReplyFlag(true),
+          noForwardFlag(false),
           peerShortRpcFormatFlag(false),
           contentLength(0),
           chunkAccessLength(0),
           syncReplicationAccess(),
-          appendPeer()
+          appendPeer(),
+          lazyChunkCreatedFlag(false)
         { SET_HANDLER(this, &WriteIdAllocOp::Done); }
     WriteIdAllocOp(const WriteIdAllocOp& other)
         : ChunkAccessRequestOp(CMD_WRITE_ID_ALLOC),
+          fileId(other.fileId),
+          leaseId(other.leaseId),
           clientSeq(other.clientSeq),
           offset(other.offset),
           numBytes(other.numBytes),
@@ -1280,11 +1302,13 @@ struct WriteIdAllocOp : public ChunkAccessRequestOp {
           fwdedOp(0),
           isForRecordAppend(other.isForRecordAppend),
           writePrepareReplyFlag(other.writePrepareReplyFlag),
+          noForwardFlag(other.noForwardFlag),
           peerShortRpcFormatFlag(false),
           contentLength(other.contentLength),
           chunkAccessLength(other.chunkAccessLength),
           syncReplicationAccess(other.syncReplicationAccess),
-          appendPeer()
+          appendPeer(),
+          lazyChunkCreatedFlag(false)
     {
         chunkId                   = other.chunkId;
         chunkVersion              = other.chunkVersion;
@@ -1306,6 +1330,7 @@ struct WriteIdAllocOp : public ChunkAccessRequestOp {
     // write-id alloc op as a hint to page the data back in---writes
     // are coming.
     void ReadChunkMetadata();
+    int WriteLazyCreatedChunkMetadata();
 
     void ForwardToPeer(
         const ServerLocation& loc,
@@ -1349,6 +1374,8 @@ struct WriteIdAllocOp : public ChunkAccessRequestOp {
     template<typename T> static T& ParserDef(T& parser)
     {
         return ChunkAccessRequestOp::ParserDef(parser)
+        .Def2("File-handle",         "P",  &WriteIdAllocOp::fileId, kfsFileId_t(-1))
+        .Def2("Lease-id",            "L",  &WriteIdAllocOp::leaseId, int64_t(-1))
         .Def2("Offset",              "O",  &WriteIdAllocOp::offset)
         .Def2("Num-bytes",           "B",  &WriteIdAllocOp::numBytes)
         .Def2("Num-servers",         "R",  &WriteIdAllocOp::numServers)
@@ -1356,6 +1383,7 @@ struct WriteIdAllocOp : public ChunkAccessRequestOp {
         .Def2("For-record-append",   "A",  &WriteIdAllocOp::isForRecordAppend, false)
         .Def2("Client-cseq",         "Cc", &WriteIdAllocOp::clientSeq)
         .Def2("Write-prepare-reply", "WR", &WriteIdAllocOp::writePrepareReplyFlag)
+        .Def2("No-forward",          "NF", &WriteIdAllocOp::noForwardFlag, false)
         .Def2("Content-length",      "l",  &WriteIdAllocOp::contentLength, 0)
         .Def2("C-access-length",     "AL", &WriteIdAllocOp::chunkAccessLength)
         ;
@@ -1369,6 +1397,7 @@ struct WritePrepareOp : public ChunkAccessRequestOp {
     uint32_t              checksum;   /* input: as computed by the sender; 0 means sender didn't send */
     StringBufT<256>       servers;    /* input: set of servers on which to write */
     bool                  replyRequestedFlag;
+    bool                  noForwardFlag;
     int                   accessFwdLength;
     int                   chunkAccessLength;
     SyncReplicationAccess syncReplicationAccess;
@@ -1378,6 +1407,8 @@ struct WritePrepareOp : public ChunkAccessRequestOp {
     uint32_t              numDone;    // sub/forwarding ops count
     BufferManager*        devBufMgr;
     uint32_t              receivedChecksum;
+    int                   checksumsCnt;
+    TokenValue            checksumsVal;
     vector<uint32_t>      blocksChecksums;
 
     WritePrepareOp()
@@ -1388,6 +1419,7 @@ struct WritePrepareOp : public ChunkAccessRequestOp {
           checksum(0),
           servers(),
           replyRequestedFlag(false),
+          noForwardFlag(false),
           accessFwdLength(0),
           chunkAccessLength(0),
           syncReplicationAccess(),
@@ -1397,6 +1429,8 @@ struct WritePrepareOp : public ChunkAccessRequestOp {
           numDone(0),
           devBufMgr(0),
           receivedChecksum(0),
+          checksumsCnt(0),
+          checksumsVal(),
           blocksChecksums()
         { SET_HANDLER(this, &WritePrepareOp::Done); }
     ~WritePrepareOp();
@@ -1407,6 +1441,7 @@ struct WritePrepareOp : public ChunkAccessRequestOp {
         return syncReplicationAccess.Parse(
             is, chunkAccessLength, accessFwdLength);
     }
+    bool Validate();
     void Response(ReqOstream& os);
     void Execute();
     void ForwardToPeer(
@@ -1439,7 +1474,10 @@ struct WritePrepareOp : public ChunkAccessRequestOp {
         .Def2("Num-servers",       "R",  &WritePrepareOp::numServers)
         .Def2("Servers",           "S",  &WritePrepareOp::servers)
         .Def2("Checksum",          "K",  &WritePrepareOp::checksum)
+        .Def2("Checksum-entries", "KC", &WritePrepareOp::checksumsCnt)
+        .Def2("Checksums",        "Ks", &WritePrepareOp::checksumsVal)
         .Def2("Reply",             "RR", &WritePrepareOp::replyRequestedFlag)
+        .Def2("No-forward",        "NF", &WritePrepareOp::noForwardFlag, false)
         .Def2("Access-fwd-length", "AF", &WritePrepareOp::accessFwdLength, 0)
         .Def2("C-access-length",   "AL", &WritePrepareOp::chunkAccessLength)
         ;
@@ -1583,6 +1621,7 @@ struct WriteSyncOp : public ChunkAccessRequestOp {
     uint32_t                  numServers;
     StringBufT<256>           servers;
     WriteSyncOp*              fwdedOp;
+    bool                      noForwardFlag;
     WriteOp*                  writeOp; // the underlying write that needs to be pushed to disk
     uint32_t                  numDone; // if we did forwarding, we wait for
                                        // local/remote to be done; otherwise, we only
@@ -1650,6 +1689,7 @@ struct WriteSyncOp : public ChunkAccessRequestOp {
         .Def2("Servers",          "S",  &WriteSyncOp::servers)
         .Def2("Checksum-entries", "KC", &WriteSyncOp::checksumsCnt)
         .Def2("Checksums",        "K",  &WriteSyncOp::checksumsVal)
+        .Def2("No-forward",       "NF", &WriteSyncOp::noForwardFlag, false)
         .Def2("Content-length",   "l",  &WriteSyncOp::contentLength, 0)
         .Def2("C-access-length",  "AL", &WriteSyncOp::chunkAccessLength)
         ;
diff --git a/src/cc/chunk/LeaseClerk.cc b/src/cc/chunk/LeaseClerk.cc
index ffee00897..925ab6091 100644
--- a/src/cc/chunk/LeaseClerk.cc
+++ b/src/cc/chunk/LeaseClerk.cc
@@ -367,7 +367,7 @@ LeaseClerk::RelinquishLease(kfsChunkId_t chunkId, int64_t chunkVersion,
     const LeaseInfo_t& lease = *it;
     LeaseRelinquishOp* const op = new LeaseRelinquishOp(
         chunkId, chunkVersion, lease.leaseId, kWriteLease);
-    KFS_LOG_STREAM_INFO <<
+    KFS_LOG_STREAM_DEBUG <<
         "sending lease relinquish for:"
         " chunk: "      << chunkId <<
         " version: "    << chunkVersion <<
diff --git a/src/cc/common/BufferedLogWriter.cc b/src/cc/common/BufferedLogWriter.cc
index 8846f5d14..528b43c45 100644
--- a/src/cc/common/BufferedLogWriter.cc
+++ b/src/cc/common/BufferedLogWriter.cc
@@ -316,7 +316,7 @@ class BufferedLogWriter::Impl : public QCRunnable
             return;
         }
         mRunFlag = true;
-        const int kStackSize = 64 << 10;
+        const int kStackSize = 256 << 10;
         mThread.Start(this, kStackSize, 0,
             QCThread::CpuAffinity(mCpuAffinityIndex));
     }
diff --git a/src/cc/kfsio/IOBuffer.cc b/src/cc/kfsio/IOBuffer.cc
index d0f43cfec..0ee0e9d92 100644
--- a/src/cc/kfsio/IOBuffer.cc
+++ b/src/cc/kfsio/IOBuffer.cc
@@ -625,6 +625,29 @@ IOBuffer::Append(IOBuffer *ioBuf)
     return nBytes;
 }
 
+IOBuffer::BufPos
+IOBuffer::AppendShared(const IOBuffer& other)
+{
+    // Add one entry per non-empty block of other; other is only read here.
+    DebugChecksum(other, other.mByteCount);
+    BufPos total = 0;
+    BList::const_iterator cur = other.mBuf.begin();
+    for (; cur != other.mBuf.end(); ++cur) {
+        const BufPos avail = cur->BytesConsumable();
+        if (0 < avail) {
+            char* const head = const_cast<char*>(cur->Consumer());
+            char* const tail = const_cast<char*>(cur->Producer());
+            mBuf.push_back(IOBufferData(*cur, head, tail));
+            total += avail;
+        }
+    }
+    assert(mByteCount >= 0);
+    mByteCount += total;
+    DebugVerify();
+    other.DebugVerify();
+    return total;
+}
+
 inline IOBuffer::BList::iterator
 IOBuffer::BeginSpaceAvailable(IOBuffer::BufPos* nBytes /* = 0 */)
 {
diff --git a/src/cc/kfsio/IOBuffer.h b/src/cc/kfsio/IOBuffer.h
index d782ca3f9..6353487d9 100644
--- a/src/cc/kfsio/IOBuffer.h
+++ b/src/cc/kfsio/IOBuffer.h
@@ -283,6 +283,10 @@ class IOBuffer
     /// Append the contents of ioBuf to this buffer.
     BufPos Append(IOBuffer *ioBuf);
 
+    // Append shared references to the consumable blocks in other.
+    // This does not modify other and does not copy payload bytes.
+    BufPos AppendShared(const IOBuffer& other);
+
     /// Move data buffers with space available at the end of ioBuf.
     /// @param[in] other  Buffer from which the available space to move
     /// @param[in] numBytes  # of bytes of available space to be used
diff --git a/src/cc/libclient/KfsClient.cc b/src/cc/libclient/KfsClient.cc
index fbd70de6e..97980cb77 100644
--- a/src/cc/libclient/KfsClient.cc
+++ b/src/cc/libclient/KfsClient.cc
@@ -1324,6 +1324,7 @@ class KfsClientImpl::ClientsList
         vector<kfsGid_t> mGroups;
         int              mDefaultFileAttributeRevalidateTime;
         unsigned int     mDefaultFileAttributeRevalidateScan;
+        size_t           mDefaultMaxFAttrCacheSize;
 
         static const Globals& Get()
         {
@@ -1341,8 +1342,9 @@ class KfsClientImpl::ClientsList
               mEUser(geteuid()),
               mEGroup(getegid()),
               mGroups(),
-              mDefaultFileAttributeRevalidateTime(30),
-              mDefaultFileAttributeRevalidateScan(64)
+              mDefaultFileAttributeRevalidateTime(3600),
+              mDefaultFileAttributeRevalidateScan(64),
+              mDefaultMaxFAttrCacheSize(262144)
         {
             signal(SIGPIPE, SIG_IGN);
             libkfsio::InitGlobals();
@@ -1389,6 +1391,19 @@ class KfsClientImpl::ClientsList
                     }
                 }
             }
+            const char* cacheSzPtr =
+                getenv("QFS_CLIENT_MAX_FATTR_CACHE_SIZE");
+            if (! cacheSzPtr) {
+                cacheSzPtr = getenv("KFS_CLIENT_MAX_FATTR_CACHE_SIZE");
+            }
+            if (cacheSzPtr) {
+                char* e = 0;
+                const long v = strtol(cacheSzPtr, &e, 10);
+                if (cacheSzPtr < e && (*e & 0xFF) <= ' ') {
+                    mDefaultMaxFAttrCacheSize = (size_t)max(
+                        16L << 10, v);
+                }
+            }
         }
         ~Globals()
             { Instance().Shutdown(); }
@@ -1414,6 +1429,16 @@ class KfsClientImpl::ClientsList
             globals.mDefaultFileAttributeRevalidateTime;
         client.mFileAttributeRevalidateScan =
             globals.mDefaultFileAttributeRevalidateScan;
+        client.mMaxFAttrCacheSize = globals.mDefaultMaxFAttrCacheSize;
+        client.mLookupRpcCount = 0;
+        client.mLookupPathCacheQueryCount = 0;
+        client.mLookupPathCacheHitCount = 0;
+        client.mLookupPathCacheStaleCount = 0;
+        client.mLookupPathCacheMissCount = 0;
+        client.mLookupFidNameCacheQueryCount = 0;
+        client.mLookupFidNameCacheHitCount = 0;
+        client.mLookupFidNameCacheStaleCount = 0;
+        client.mLookupFidNameCacheMissCount = 0;
         client.mClientId = mNextClientId++;
     }
     void RemoveSelf(KfsClientImpl& client)
@@ -1526,9 +1551,19 @@ KfsClientImpl::KfsClientImpl(
       mDeleteClearFattr(0),
       mFreeFileTableEntires(),
       mFattrCacheSkipValidateCnt(0),
-      mFileAttributeRevalidateTime(30),
+      mFileAttributeRevalidateTime(3600),
       mFileAttributeRevalidateScan(64),
       mFAttrCacheGeneration(1),
+      mMaxFAttrCacheSize(262144),
+      mLookupRpcCount(0),
+      mLookupPathCacheQueryCount(0),
+      mLookupPathCacheHitCount(0),
+      mLookupPathCacheStaleCount(0),
+      mLookupPathCacheMissCount(0),
+      mLookupFidNameCacheQueryCount(0),
+      mLookupFidNameCacheHitCount(0),
+      mLookupFidNameCacheStaleCount(0),
+      mLookupFidNameCacheMissCount(0),
       mTmpPath(),
       mTmpAbsPathStr(),
       mTmpAbsPath(),
@@ -2106,10 +2141,11 @@ KfsClientImpl::Mkdir(const char *pathname, kfsMode_t mode)
 
     kfsFileId_t parentFid;
     string      dirname;
+    string      path;
     const bool  kInvalidateSubCountsFlag = true;
     const bool  kEnforceLastDirFlag      = false;
     int         res                      = GetPathComponents(
-        pathname, &parentFid, dirname, 0,
+        pathname, &parentFid, dirname, &path,
         kInvalidateSubCountsFlag, kEnforceLastDirFlag);
     if (res < 0) {
         return res;
@@ -2126,6 +2162,10 @@ KfsClientImpl::Mkdir(const char *pathname, kfsMode_t mode)
     if (op.status < 0) {
         return GetOpStatus(op);
     }
+    if (0 <= op.fileId) {
+        CacheCreatedEntry(parentFid, dirname, path, op.fileId,
+            op.permissions, true);
+    }
     time_t now = 0; // assign to suppress compiler warning.
     if (! op.userName.empty()) {
         now = time(0);
@@ -3528,13 +3568,26 @@ KfsClientImpl::CreateSelf(const char *pathname, int numReplicas, bool exclusive,
     const bool  kInvalidateSubCountsFlag = true;
     res = GetPathComponents(pathname, &parentFid, filename, &path,
         kInvalidateSubCountsFlag);
-    Delete(LookupFAttr(parentFid, filename));
     if (res < 0) {
         KFS_LOG_STREAM_DEBUG <<
             pathname << ": GetPathComponents: " << res <<
         KFS_LOG_EOM;
         return res;
     }
+    return CreateSelfResolved(pathname, parentFid, filename, path,
+        numReplicas, exclusive, numStripes, numRecoveryStripes, stripeSize,
+        stripedType, forceTypeFlag, mode, minSTier, maxSTier);
+}
+
+int
+KfsClientImpl::CreateSelfResolved(const char *pathname, kfsFileId_t parentFid,
+    const string& filename, const string& path, int numReplicas,
+    bool exclusive, int numStripes, int numRecoveryStripes, int stripeSize,
+    int stripedType, bool forceTypeFlag, kfsMode_t mode,
+    kfsSTier_t minSTier, kfsSTier_t maxSTier)
+{
+    assert(mMutex.IsOwned());
+    Delete(LookupFAttr(parentFid, filename));
     CreateOp op(0, parentFid, filename.c_str(), numReplicas, exclusive,
         Permissions(
             mUseOsUserAndGroupFlag ? mEUser  : kKfsUserNone,
@@ -3572,33 +3625,30 @@ KfsClientImpl::CreateSelf(const char *pathname, int numReplicas, bool exclusive,
             " striped file type " << op.striperType <<
             " is not supported "  << " got: " << op.metaStriperType <<
         KFS_LOG_EOM;
-        // Cleanup the file.
         RemoveOp rm(0, parentFid, filename.c_str(), pathname);
         DoMetaOpWithRetry(&rm);
         return -ENXIO;
     }
+    if (0 <= op.fileId) {
+        CacheCreatedEntry(parentFid, filename, path, op.fileId,
+            op.permissions, false);
+    }
 
-    // Do not attempt to re-use possibly existing file table entry.
-    // If file existed and being written into it is moved into the dumpster by
-    // the meta server.
-    // An attempt to re-use the same file table entry would route the ios to the
-    // previously existed file into newly created one.
     const int fte = AllocFileTableEntry(parentFid, filename, path);
-    if (fte < 0) {      // XXX Too many open files
+    if (fte < 0) {
         KFS_LOG_STREAM_DEBUG <<
             pathname << ": AllocFileTableEntry: " << fte <<
         KFS_LOG_EOM;
         return fte;
     }
 
-    // make it the same as creat(): equivalent to open(O_CREAT|O_WRONLY|O_TRUNC).
     FileTableEntry& entry = *mFileTable[fte];
     entry.openMode        = O_WRONLY;
     FileAttr& fa = entry.fattr;
-    fa.Init(false);    // is an ordinary file
+    fa.Init(false);
     fa.fileId      = op.fileId;
     fa.numReplicas = op.metaNumReplicas;
-    fa.fileSize    = 0; // presently CreateOp always deletes file if exists.
+    fa.fileSize    = 0;
     fa.minSTier    = op.minSTier;
     fa.maxSTier    = op.maxSTier;
     if (op.metaStriperType != KFS_STRIPED_FILE_TYPE_NONE) {
@@ -3619,7 +3669,6 @@ KfsClientImpl::CreateSelf(const char *pathname, int numReplicas, bool exclusive,
         }
         UpdateGroupId(op.groupName, fa.group, now);
     }
-    // Set optimal io size, like open does.
     SetOptimalReadAheadSize(entry, mDefaultReadAheadSize);
     SetOptimalIoBufferSize(entry, mDefaultIoBufferSize);
     KFS_LOG_STREAM_DEBUG <<
@@ -4149,7 +4198,15 @@ KfsClientImpl::OpenSelf(const char *pathname, int openMode, int numReplicas,
     kfsFileId_t parentFid = -1;
     string      filename;
     string      fpath;
-    const int res = GetPathComponents(pathname, &parentFid, filename, &fpath);
+    const bool  createFastPathFlag =
+        ! cacheAttributesFlag && (openMode & O_CREAT) != 0 &&
+        (openMode & (O_EXCL | O_TRUNC | O_APPEND)) == 0 &&
+        (openMode & (O_RDWR | O_WRONLY)) != 0;
+    const bool  kInvalidateSubCountsFlag = false;
+    const bool  kEnforceLastDirFlag      = true;
+    const bool  kFollowSymLinkFlag       = ! createFastPathFlag;
+    int res = GetPathComponents(pathname, &parentFid, filename, &fpath,
+        kInvalidateSubCountsFlag, kEnforceLastDirFlag, kFollowSymLinkFlag);
     if (res < 0) {
         return res;
     }
@@ -4158,6 +4215,27 @@ KfsClientImpl::OpenSelf(const char *pathname, int openMode, int numReplicas,
     if (path) {
         *path = fpath;
     }
+    if (createFastPathFlag) {
+        int cres = KfsClient::ValidateCreateParams(
+            numReplicas, numStripes, numRecoveryStripes,
+            stripeSize, stripedType, minSTier, maxSTier);
+        if (cres < 0) {
+            return cres;
+        }
+        const int fte = CreateSelfResolved(pathname, parentFid, filename, fpath,
+            numReplicas, true /* exclusive */, numStripes, numRecoveryStripes,
+            stripeSize, stripedType, false, mode, minSTier, maxSTier);
+        if (fte >= 0 || fte != -EEXIST) {
+            return fte;
+        }
+        res = GetPathComponents(pathname, &parentFid, filename, &fpath);
+        if (res < 0) {
+            return res;
+        }
+        if (path) {
+            *path = fpath;
+        }
+    }
     bool         objectStoreTruncateFlag   = false;
     LookupOp     op(0, parentFid, filename.c_str());
     FAttr*       fa                        = LookupFAttr(parentFid, filename);
@@ -4848,6 +4926,7 @@ KfsClientImpl::StartProtocolWorker()
         return;
     }
     KfsProtocolWorker::Parameters params;
+    params.mUseClientPoolFlag = true;
     if (mProtocolWorkerAuthCtx.IsEnabled()) {
         params.mAuthContextPtr = &mProtocolWorkerAuthCtx;
     }
@@ -4881,6 +4960,9 @@ KfsClientImpl::StartProtocolWorker()
     }
     params.mUseClientPoolFlag = mConfig.getValue(
         "client.connectionPool", params.mUseClientPoolFlag ? 1 : 0) != 0;
+    params.mParallelReplicaWriteFlag = mConfig.getValue(
+        "client.parallelReplicaWrite",
+        params.mParallelReplicaWriteFlag ? 1 : 0) != 0;
     params.mMetaServerNodes = mConfig.getValue(
         KfsClient::GetMetaServerNodesParamName(), params.mMetaServerNodes);
     params.mClientRackId    = mConfig.getValue(
@@ -5648,16 +5730,20 @@ KfsClientImpl::FindFreeFileTableEntry()
 void
 KfsClientImpl::ValidateFAttrCache(time_t now, int maxScan)
 {
-    FAttr*       p;
-    const time_t expire = now - mFileAttributeRevalidateTime;
-    int          rem    = maxScan;
-    while ((p = FAttrLru::Front(mFAttrLru)) &&
-            (p->validatedTime < expire ||
-                p->generation != mFAttrCacheGeneration)) {
-        Delete(p);
-        if (--rem < 0) {
-            break;
+    FAttr* p;
+    int    rem = maxScan; // bounds the number of deletions in this call
+    while ((p = FAttrLru::Front(mFAttrLru))) {
+        const bool expiredFlag = // age test; skipped if the time is negative
+            (0 <= mFileAttributeRevalidateTime &&
+                p->validatedTime < now - mFileAttributeRevalidateTime);
+        if (p->generation != mFAttrCacheGeneration || expiredFlag) {
+            Delete(p);
+            if (--rem < 0) {
+                break;
+            }
+            continue; // examine whatever is at the front of the LRU now
         }
+        break; // front entry is current and not expired: stop scanning
     }
 }
 
@@ -5694,9 +5780,8 @@ KfsClientImpl::NewFAttr(kfsFileId_t parentFid, const string& name,
         mFattrCacheSkipValidateCnt = 0;
         ValidateFAttrCache(time(0), mFileAttributeRevalidateScan);
     }
-    const size_t kMaxInodeCacheSize = 16 << 10;
     for (size_t sz = mFidNameToFAttrMap.size();
-            kMaxInodeCacheSize <= sz;
+            mMaxFAttrCacheSize <= sz;
             sz--) {
         Delete(FAttrLru::Front(mFAttrLru));
     }
@@ -5861,15 +5946,47 @@ KfsClientImpl::Lookup(kfsFileId_t parentFid, const string& name,
     assert(! path.empty() && *path.begin() == '/' &&
         name != "." && name != "..");
 
+    mLookupFidNameCacheQueryCount++;
     fa = LookupFAttr(parentFid, name);
     if (fa && IsValid(*fa, now)) {
+        mLookupFidNameCacheHitCount++;
         UpdatePath(fa, path);
         return 0;
     }
+    if (fa) {
+        mLookupFidNameCacheStaleCount++;
+    } else {
+        mLookupFidNameCacheMissCount++;
+    }
+    mLookupRpcCount++;
     LookupOp op(0, parentFid, name.c_str());
     return LookupSelf(op, parentFid, name, fa, now, path);
 }
 
+// Hand a file or directory that a create / mkdir RPC has just returned to
+// UpdateFattr(), using attributes built from the reply's id and permissions.
+// Does nothing unless fileId is non-negative and fullPath is absolute.
+void
+KfsClientImpl::CacheCreatedEntry(
+    kfsFileId_t parentFid, const string& name, const string& fullPath,
+    kfsFileId_t fileId, const Permissions& perms, bool isDirectory)
+{
+    if (fileId < 0 || fullPath.empty() || fullPath[0] != '/') {
+        return;
+    }
+    FileAttr attr;
+    // Init() goes first, as in CreateSelfResolved(), so that whatever it
+    // sets up cannot overwrite the fields filled in from the reply below.
+    attr.Init(isDirectory);
+    attr.fileId      = fileId;
+    attr.isDirectory = isDirectory;
+    attr.user        = perms.user;
+    attr.group       = perms.group;
+    attr.mode        = perms.mode;
+    FAttr* fa = 0;
+    (void)UpdateFattr(parentFid, name, fa, fullPath, attr, time(0));
+}
+
 int
 KfsClientImpl::LookupSelf(LookupOp& op,
     kfsFileId_t parentFid, const string& name,
@@ -5991,6 +6108,32 @@ KfsClientImpl::GetPathComponents(const char* pathname, kfsFileId_t* parentFid,
         if (! followSymLinkFlag && lastFlag && noCheckLastDirFlag) {
             break;
         }
+        mLookupPathCacheQueryCount++;
+        fa = LookupFAttr(npath, static_cast<string*>(0));
+        if (fa && IsValid(*fa, now)) {
+            mLookupPathCacheHitCount++;
+            if (! fa->isDirectory) {
+                if (lastFlag && noCheckLastDirFlag) {
+                    break;
+                }
+                res = -ENOTDIR;
+                break;
+            }
+            if (invalidateSubCountsFlag) {
+                fa->staleSubCountsFlag = true;
+            }
+            *parentFid = fa->fileId;
+            if (lastFlag) {
+                break;
+            }
+            mTmpPath.push_back(make_pair(*parentFid, i));
+            continue;
+        }
+        if (fa) {
+            mLookupPathCacheStaleCount++;
+        } else {
+            mLookupPathCacheMissCount++;
+        }
         fa = 0;
         if ((res = Lookup(*parentFid, name, fa, now, npath)) != 0) {
             if (lastFlag && -ENOENT == res && noCheckLastDirFlag) {
@@ -7809,12 +7952,65 @@ KfsClientImpl::GetStats()
 {
     QCStMutexLocker l(mMutex);
     StartProtocolWorker();
-    Properties stats = mProtocolWorker->GetStats();
-    if (stats.empty()) {
-        return 0;
+    Properties workerStats = mProtocolWorker->GetStats();
+    Properties* const ret  = new Properties();
+    if (! workerStats.empty()) {
+        ret->swap(workerStats);
+    }
+    const int64_t lookupTotal = mLookupRpcCount +
+        mLookupPathCacheHitCount + mLookupFidNameCacheHitCount;
+    string val;
+    AppendDecIntToString(val, mLookupRpcCount);
+    ret->setValue("PathCache.LookupRpc", val);
+    val.clear();
+    AppendDecIntToString(val, mLookupPathCacheQueryCount);
+    ret->setValue("PathCache.PathQuery", val);
+    val.clear();
+    AppendDecIntToString(val, mLookupPathCacheHitCount);
+    ret->setValue("PathCache.PathHit", val);
+    val.clear();
+    AppendDecIntToString(val, mLookupPathCacheStaleCount);
+    ret->setValue("PathCache.PathStale", val);
+    val.clear();
+    AppendDecIntToString(val, mLookupPathCacheMissCount);
+    ret->setValue("PathCache.PathMiss", val);
+    val.clear();
+    AppendDecIntToString(val, mLookupFidNameCacheQueryCount);
+    ret->setValue("PathCache.FidNameQuery", val);
+    val.clear();
+    AppendDecIntToString(val, mLookupFidNameCacheHitCount);
+    ret->setValue("PathCache.FidNameHit", val);
+    val.clear();
+    AppendDecIntToString(val, mLookupFidNameCacheStaleCount);
+    ret->setValue("PathCache.FidNameStale", val);
+    val.clear();
+    AppendDecIntToString(val, mLookupFidNameCacheMissCount);
+    ret->setValue("PathCache.FidNameMiss", val);
+    val.clear();
+    AppendDecIntToString(val, lookupTotal);
+    ret->setValue("PathCache.LookupTotal", val);
+    val.clear();
+    AppendDecIntToString(val, (int64_t)mFidNameToFAttrMap.size());
+    ret->setValue("PathCache.FattrEntries", val);
+    val.clear();
+    AppendDecIntToString(val, (int64_t)mPathCache.size());
+    ret->setValue("PathCache.PathEntries", val);
+    val.clear();
+    AppendDecIntToString(val, (int64_t)mMaxFAttrCacheSize);
+    ret->setValue("PathCache.MaxEntries", val);
+    val.clear();
+    AppendDecIntToString(val, (int64_t)mFAttrCacheGeneration);
+    ret->setValue("PathCache.Generation", val);
+    val.clear();
+    AppendDecIntToString(val, (int64_t)mFileAttributeRevalidateTime);
+    ret->setValue("PathCache.RevalidateSec", val);
+    if (0 < lookupTotal) {
+        char buf[64];
+        const double ratio = (double)(mLookupPathCacheHitCount +
+            mLookupFidNameCacheHitCount) / (double)lookupTotal;
+        snprintf(buf, sizeof(buf), "%.6f", ratio);
+        ret->setValue("PathCache.HitRatio", buf);
     }
-    Properties* const ret = new Properties();
-    ret->swap(stats);
     return ret;
 }
 
diff --git a/src/cc/libclient/KfsClientInt.h b/src/cc/libclient/KfsClientInt.h
index e50acfc85..17aec32db 100644
--- a/src/cc/libclient/KfsClientInt.h
+++ b/src/cc/libclient/KfsClientInt.h
@@ -785,6 +785,16 @@ class KfsClientImpl : private KfsNetClient::OpOwner
     int                            mFileAttributeRevalidateTime;
     unsigned int                   mFileAttributeRevalidateScan;
     unsigned int                   mFAttrCacheGeneration;
+    size_t                         mMaxFAttrCacheSize;
+    int64_t                        mLookupRpcCount;
+    int64_t                        mLookupPathCacheQueryCount;
+    int64_t                        mLookupPathCacheHitCount;
+    int64_t                        mLookupPathCacheStaleCount;
+    int64_t                        mLookupPathCacheMissCount;
+    int64_t                        mLookupFidNameCacheQueryCount;
+    int64_t                        mLookupFidNameCacheHitCount;
+    int64_t                        mLookupFidNameCacheStaleCount;
+    int64_t                        mLookupFidNameCacheMissCount;
     TmpPath                        mTmpPath;
     string                         mTmpAbsPathStr;
     Path                           mTmpAbsPath;
@@ -855,7 +865,8 @@ class KfsClientImpl : private KfsNetClient::OpOwner
     bool IsValid(const FAttr& fa, time_t now) const
     {
         return (fa.generation == mFAttrCacheGeneration &&
-            now <= fa.validatedTime + mFileAttributeRevalidateTime);
+            (mFileAttributeRevalidateTime < 0 || // negative: skip the age check
+                now <= fa.validatedTime + mFileAttributeRevalidateTime));
     }
 
     void Shutdown();
@@ -887,6 +898,11 @@ class KfsClientImpl : private KfsNetClient::OpOwner
     int CreateSelf(const char *pathname, int numReplicas, bool exclusive,
         int numStripes, int numRecoveryStripes, int stripeSize, int stripedType,
         bool forceTypeFlag, kfsMode_t mode, kfsSTier_t minSTier, kfsSTier_t maxSTier);
+    int CreateSelfResolved(const char *pathname, kfsFileId_t parentFid,
+        const string& filename, const string& path, int numReplicas,
+        bool exclusive, int numStripes, int numRecoveryStripes,
+        int stripeSize, int stripedType, bool forceTypeFlag, kfsMode_t mode,
+        kfsSTier_t minSTier, kfsSTier_t maxSTier);
     ssize_t SetReadAheadSize(FileTableEntry& inEntry, size_t inSize, bool optimalFlag = false);
     ssize_t SetIoBufferSize(FileTableEntry& entry, size_t size, bool optimalFlag = false);
     ssize_t SetOptimalIoBufferSize(FileTableEntry& entry, size_t size) {
@@ -978,6 +994,13 @@ class KfsClientImpl : private KfsNetClient::OpOwner
         kfsFileId_t parentFid, const string& name, FAttr*& fa,
         time_t now, const string& path);
     FAttr* LookupFattr(kfsFileId_t parentFid, const string& name);
+    void CacheCreatedEntry(
+        kfsFileId_t            parentFid,
+        const string&          name,
+        const string&          fullPath,
+        kfsFileId_t            fileId,
+        const Permissions&     perms,
+        bool                   isDirectory);
 
     // name -- is the last component of the pathname
     int AllocFileTableEntry(
diff --git a/src/cc/libclient/KfsOps.cc b/src/cc/libclient/KfsOps.cc
index 66a7a0b3b..dc63c0dd8 100644
--- a/src/cc/libclient/KfsOps.cc
+++ b/src/cc/libclient/KfsOps.cc
@@ -614,6 +614,9 @@ CloseOp::Request(ReqOstream& os)
             chunkVersion << "\r\n"
         << Access()
     ;
+    if (noForwardFlag) {
+        os << (shortRpcFormatFlag ? "NF:1\r\n" : "No-forward: 1\r\n");
+    }
     if (! writeInfo.empty()) {
         os <<
             (shortRpcFormatFlag ? "W:1\r\n" : "Has-write-id: 1\r\n") <<
@@ -663,12 +666,22 @@ WriteIdAllocOp::Request(ReqOstream& os)
 {
     os <<
     "WRITE_ID_ALLOC\r\n"  << ReqHeaders(*this)           <<
-    (shortRpcFormatFlag ? "H:" : "Chunk-handle: ")  << chunkId      << "\r\n" <<
+    (shortRpcFormatFlag ? "H:" : "Chunk-handle: ")  << chunkId      << "\r\n";
+    if (fileId >= 0) {
+        os << (shortRpcFormatFlag ? "P:" : "File-handle: ") <<
+            fileId << "\r\n";
+    }
+    if (leaseId >= 0) {
+        os << (shortRpcFormatFlag ? "L:" : "Lease-id: ") <<
+            leaseId << "\r\n";
+    }
+    os <<
     (shortRpcFormatFlag ? "V:" : "Chunk-version: ") << chunkVersion << "\r\n" <<
     (shortRpcFormatFlag ? "O:" : "Offset: ")        << offset       << "\r\n" <<
     (shortRpcFormatFlag ? "B:" : "Num-bytes: ")     << numBytes     << "\r\n" <<
     (shortRpcFormatFlag ? "A:" : "For-record-append: ") <<
         (isForRecordAppend ? 1 : 0) << "\r\n" <<
+    (noForwardFlag ? (shortRpcFormatFlag ? "NF:1\r\n" : "No-forward: 1\r\n") : "") <<
     (shortRpcFormatFlag ? "R:" : "Num-servers: ") <<
         chunkServerLoc.size() << "\r\n" <<
     Access() <<
@@ -737,10 +750,16 @@ WritePrepareOp::Request(ReqOstream& os)
     (shortRpcFormatFlag ? "K:"  : "Checksum: ")     << checksum     << "\r\n" <<
     Access()
     ;
+    if (noForwardFlag) {
+        os << (shortRpcFormatFlag ? "NF:1\r\n" : "No-forward: 1\r\n");
+    }
     if (! checksums.empty()) {
         os << (shortRpcFormatFlag ? "KC:" : "Checksum-entries: ") <<
             checksums.size()  << "\r\n" <<
             (shortRpcFormatFlag ? "Ks:" : "Checksums: ");
+        if (shortRpcFormatFlag) {
+            os << std::hex;
+        }
         for (size_t i = 0; i < checksums.size(); i++) {
             os << checksums[i] << ' ';
         }
@@ -775,8 +794,14 @@ WriteSyncOp::Request(ReqOstream& os)
         checksums.size() << "\r\n" <<
     Access()
     ;
+    if (noForwardFlag) {
+        os << (shortRpcFormatFlag ? "NF:1\r\n" : "No-forward: 1\r\n");
+    }
     if (! checksums.empty()) {
         os << (shortRpcFormatFlag ? "K:" : "Checksums: ");
+        if (shortRpcFormatFlag) {
+            os << std::hex;
+        }
         for (size_t i = 0; i < checksums.size(); i++) {
             os << checksums[i] << ' ';
         }
@@ -1413,6 +1438,8 @@ AllocateOp::ParseResponseHeaderSelf(const Properties& prop)
     if (status < 0) {
         return;
     }
+    leaseId = prop.getValue(
+        shortRpcFormatFlag ? "L" : "Lease-id", int64_t(-1));
     chunkLeaseDuration = prop.getValue(
         shortRpcFormatFlag ? "LD" : "Lease-duration", int64_t(-1));
     if (ParseChunkServerAccess(*this, prop.getValue(
diff --git a/src/cc/libclient/KfsOps.h b/src/cc/libclient/KfsOps.h
index a89a73c9e..fcd5270a0 100644
--- a/src/cc/libclient/KfsOps.h
+++ b/src/cc/libclient/KfsOps.h
@@ -1043,6 +1043,7 @@ struct AllocateOp : public KfsOp {
     bool                   invalidateAllFlag;
     bool                   allowCSClearTextFlag;
     bool                   allCSShortRpcFlag;
+    int64_t                leaseId;
     int64_t                chunkLeaseDuration;
     int64_t                chunkServerAccessValidForTime;
     int64_t                chunkServerAccessIssuedTime;
@@ -1063,6 +1064,7 @@ struct AllocateOp : public KfsOp {
           invalidateAllFlag(false),
           allowCSClearTextFlag(false),
           allCSShortRpcFlag(false),
+          leaseId(-1),
           chunkLeaseDuration(-1),
           chunkServerAccessValidForTime(0),
           chunkServerAccessIssuedTime(0),
@@ -1084,6 +1086,7 @@ struct AllocateOp : public KfsOp {
         invalidateAllFlag             = false;
         allowCSClearTextFlag          = false;
         allCSShortRpcFlag             = false;
+        leaseId                       = -1;
         chunkLeaseDuration            = -1;
         chunkServerAccessValidForTime = 0;
         chunkServerAccessIssuedTime   = 0;
@@ -1188,14 +1191,17 @@ class ShowWriteInfo {
 struct CloseOp : public ChunkAccessOp {
     vector<ServerLocation> chunkServerLoc;
     vector<WriteInfo>      writeInfo;
+    bool                   noForwardFlag;
 
     CloseOp(kfsSeq_t s, kfsChunkId_t c)
         : ChunkAccessOp(CMD_CLOSE, s, c),
-          writeInfo()
+          writeInfo(),
+          noForwardFlag(false)
         {}
     CloseOp(kfsSeq_t s, kfsChunkId_t c, const vector<WriteInfo>& wi)
         : ChunkAccessOp(CMD_CLOSE, s, c),
-          writeInfo(wi)
+          writeInfo(wi),
+          noForwardFlag(false)
         {}
     void Request(ReqOstream& os);
     virtual ostream& ShowSelf(ostream& os) const {
@@ -1262,19 +1268,25 @@ struct ReadOp : public ChunkAccessOp {
 
 // op that defines the write that is going to happen
 struct WriteIdAllocOp : public ChunkAccessOp {
+    kfsFileId_t  fileId;      /* input, optional for lazy chunk create */
+    int64_t      leaseId;     /* input, optional for lazy chunk create */
     chunkOff_t   offset;       /* input */
     size_t       numBytes;     /* input */
     bool         isForRecordAppend; /* set if this is for a record append that is coming */
     bool         writePrepReplySupportedFlag;
+    bool         noForwardFlag;
     string       writeIdStr;   /* output */
     vector<ServerLocation> chunkServerLoc;
 
     WriteIdAllocOp(kfsSeq_t s, kfsChunkId_t c, int64_t v, chunkOff_t o, size_t n)
         : ChunkAccessOp(CMD_WRITE_ID_ALLOC, s, c),
+          fileId(-1),
+          leaseId(-1),
           offset(o),
           numBytes(n),
           isForRecordAppend(false),
-          writePrepReplySupportedFlag(false)
+          writePrepReplySupportedFlag(false),
+          noForwardFlag(false)
         { chunkVersion = v; }
     void Request(ReqOstream& os);
     virtual void ParseResponseHeaderSelf(const Properties& prop);
@@ -1290,6 +1302,7 @@ struct WritePrepareOp : public ChunkAccessOp {
     chunkOff_t        offset;       /* input */
     size_t            numBytes;     /* input */
     bool              replyRequestedFlag;
+    bool              noForwardFlag;
     vector<uint32_t>  checksums;    /* checksum for each 64KB block */
     vector<WriteInfo> writeInfo;    /* input */
 
@@ -1298,6 +1311,7 @@ struct WritePrepareOp : public ChunkAccessOp {
           offset(0),
           numBytes(0),
           replyRequestedFlag(false),
+          noForwardFlag(false),
           checksums(),
           writeInfo()
         { chunkVersion = v; }
@@ -1319,6 +1333,7 @@ struct WriteSyncOp : public ChunkAccessOp {
     chunkOff_t        offset; /* input */
     size_t            numBytes; /* input */
     vector<WriteInfo> writeInfo;
+    bool              noForwardFlag;
     // The checksums that cover the region.
     vector<uint32_t>  checksums;
 
@@ -1326,7 +1341,8 @@ struct WriteSyncOp : public ChunkAccessOp {
         : ChunkAccessOp(CMD_WRITE_SYNC, 0, 0),
           offset(0),
           numBytes(0),
-          writeInfo()
+          writeInfo(),
+          noForwardFlag(false)
         {}
     void Request(ReqOstream& os);
     virtual ostream& ShowSelf(ostream& os) const {
diff --git a/src/cc/libclient/KfsProtocolWorker.cc b/src/cc/libclient/KfsProtocolWorker.cc
index 4752048f1..07eb3da62 100644
--- a/src/cc/libclient/KfsProtocolWorker.cc
+++ b/src/cc/libclient/KfsProtocolWorker.cc
@@ -138,6 +138,7 @@ class KfsProtocolWorker::Impl :
           mStopRequest(),
           mWorker(this, "KfsProtocolWorker"),
           mMutex(),
+          mParallelReplicaWriteFlag(inParameters.mParallelReplicaWriteFlag),
           mClientPoolPtr(inParameters.mUseClientPoolFlag ?
             new ClientPool(
                 mNetManager,
@@ -154,7 +155,7 @@ class KfsProtocolWorker::Impl :
                     int64_t(std::numeric_limits<int>::max())
                 ), // inMaxContentLength
                 false,                       // inFailAllOpsOnOpTimeoutFlag
-                false,                       // inMaxOneOutstandingOpFlag
+                true,                        // inMaxOneOutstandingOpFlag
                 0                            // inAuthContextPtr
             ) : 0
         ),
@@ -1149,7 +1150,9 @@ class KfsProtocolWorker::Impl :
                 min(max(4 << 20, inOwner.mMaxWriteSize),
                     max(inOwner.mMaxWriteSize, inMaxWriteSize)),
                 inLogPrefixPtr,
-                inOwner.mChunkServerInitialSeqNum
+                inOwner.mChunkServerInitialSeqNum,
+                inOwner.mClientPoolPtr,
+                inOwner.mParallelReplicaWriteFlag
               ),
               mCurRequestPtr(0),
               mAsyncStatus(0)
@@ -1716,6 +1719,7 @@ class KfsProtocolWorker::Impl :
     StopRequest          mStopRequest;
     QCThread             mWorker;
     QCMutex              mMutex;
+    const bool            mParallelReplicaWriteFlag;
     ClientPool* const    mClientPoolPtr;
     FileReader::Stats    mReadStats;
     FileWriter::Stats    mWriteStats;
diff --git a/src/cc/libclient/KfsProtocolWorker.h b/src/cc/libclient/KfsProtocolWorker.h
index 6b8165eb3..9a9784e37 100644
--- a/src/cc/libclient/KfsProtocolWorker.h
+++ b/src/cc/libclient/KfsProtocolWorker.h
@@ -214,7 +214,8 @@ class KfsProtocolWorker
             bool               inResolverUseOsResolverFlag   = false,
             int                inResolverCacheSize           = 8 << 10,
             int                inResolverCacheExpiration     = -1,
-            const string&      inNodeId                      = string())
+            const string&      inNodeId                      = string(),
+            bool               inParallelReplicaWriteFlag    = true)
             : mMetaMaxRetryCount(inMetaMaxRetryCount),
               mMetaTimeSecBetweenRetries(inMetaTimeSecBetweenRetries),
               mMetaOpTimeoutSec(inMetaOpTimeoutSec),
@@ -244,7 +245,8 @@ class KfsProtocolWorker
               mResolverUseOsResolverFlag(inResolverUseOsResolverFlag),
               mResolverCacheSize(inResolverCacheSize),
               mResolverCacheExpiration(inResolverCacheExpiration),
-              mNodeId(inNodeId)
+              mNodeId(inNodeId),
+              mParallelReplicaWriteFlag(inParallelReplicaWriteFlag)
             {}
             int                 mMetaMaxRetryCount;
             int                 mMetaTimeSecBetweenRetries;
@@ -276,6 +278,7 @@ class KfsProtocolWorker
             int                 mResolverCacheSize;
             int                 mResolverCacheExpiration;
             string              mNodeId;
+            bool                mParallelReplicaWriteFlag;
     };
     KfsProtocolWorker(
         std::string       inMetaHost,
diff --git a/src/cc/libclient/WriteAppender.cc b/src/cc/libclient/WriteAppender.cc
index 564d72150..4dd35a064 100644
--- a/src/cc/libclient/WriteAppender.cc
+++ b/src/cc/libclient/WriteAppender.cc
@@ -1041,6 +1041,8 @@ class WriteAppender::Impl : private ITimeout, private KfsNetClient::OpOwner
         QCASSERT(mAllocOp.chunkId > 0 && ! mAllocOp.chunkServers.empty());
         Reset(mWriteIdAllocOp);
         mWriteIdAllocOp.chunkId           = mAllocOp.chunkId;
+        mWriteIdAllocOp.fileId            = mAllocOp.fid;
+        mWriteIdAllocOp.leaseId          = mAllocOp.leaseId;
         mWriteIdAllocOp.chunkVersion      = mAllocOp.chunkVersion;
         mWriteIdAllocOp.isForRecordAppend = true;
         mWriteIdAllocOp.chunkServerLoc    = mAllocOp.chunkServers;
diff --git a/src/cc/libclient/Writer.cc b/src/cc/libclient/Writer.cc
index e4a8d6d75..17cadd48f 100644
--- a/src/cc/libclient/Writer.cc
+++ b/src/cc/libclient/Writer.cc
@@ -31,6 +31,8 @@
 #include <sstream>
 #include <bitset>
 #include <string.h>
+#include <vector>
+#include <sys/time.h>
 
 #include "kfsio/IOBuffer.h"
 #include "kfsio/NetManager.h"
@@ -47,6 +49,7 @@
 #include "KfsOps.h"
 #include "KfsClient.h"
 #include "Monitor.h"
+#include "ClientPool.h"
 
 namespace KFS
 {
@@ -59,6 +62,13 @@ using std::string;
 using std::ostream;
 using std::ostringstream;
 
+static int64_t WriterNowUsec() // Wall clock time, in microseconds.
+{
+    struct timeval theTime;
+    gettimeofday(&theTime, 0);
+    return (int64_t(theTime.tv_sec) * 1000000 + int64_t(theTime.tv_usec));
+}
+
 // Kfs client write state machine implementation.
 class Writer::Impl :
     public QCRefCountedObj,
@@ -92,7 +102,9 @@ class Writer::Impl :
         int           inIdleTimeoutSec,
         int           inMaxWriteSize,
         const string& inLogPrefix,
-        int64_t       inChunkServerInitialSeqNum)
+        int64_t       inChunkServerInitialSeqNum,
+        ClientPool*   inClientPoolPtr,
+        bool          inParallelReplicaWriteFlag)
         : QCRefCountedObj(),
           ITimeout(),
           KfsNetClient::OpOwner(),
@@ -121,6 +133,8 @@ class Writer::Impl :
           mOffset(0),
           mOpenChunkBlockSize(CHUNKSIZE),
           mChunkServerInitialSeqNum(inChunkServerInitialSeqNum),
+          mClientPoolPtr(inClientPoolPtr),
+          mParallelReplicaWriteFlag(inParallelReplicaWriteFlag),
           mCompletionPtr(inCompletionPtr),
           mBuffer(),
           mLogPrefix(inLogPrefix),
@@ -130,7 +144,9 @@ class Writer::Impl :
           mOpStartTime(0),
           mCompletionDepthCount(0),
           mStriperProcessCount(0),
-          mStriperPtr(0)
+          mStriperPtr(0),
+          mCloseStartUsec(0),
+          mSetSizeStartUsec(0)
         { Writers::Init(mWriters); }
     int Open(
         kfsFileId_t inFileId,
@@ -214,6 +230,7 @@ class Writer::Impl :
             return kErrorTryAgain;
         }
         mClosingFlag = true;
+        mCloseStartUsec = WriterNowUsec();
         return StartWrite();
     }
     Offset Write(
@@ -381,7 +398,29 @@ class Writer::Impl :
             size_t         mBeginBlock;
             size_t         mEndBlock;
             time_t         mOpStartTime;
+            int64_t        mEnqueueUsec;
             bool           mChecksumValidFlag;
+            struct ParallelReplica // Holds one write prepare op and its buffer.
+            {
+                WritePrepareOp mPrepareOp;
+                IOBuffer       mBuffer;    // presumably this replica's payload -- TODO confirm
+                KfsNetClient*  mClientPtr; // Not deleted by this struct.
+                bool           mDoneFlag;  // Starts false.
+                ParallelReplica()
+                    : mPrepareOp(0, 0, 0),
+                      mBuffer(),
+                      mClientPtr(0),
+                      mDoneFlag(false)
+                    {}
+            private:
+                ParallelReplica(const ParallelReplica&); // Private: no copying.
+                ParallelReplica& operator=(const ParallelReplica&);
+            };
+            typedef std::vector<ParallelReplica*> ParallelReplicas;
+            ParallelReplicas mParallelReplicas;
+            int            mParallelDoneCount;
+            int            mParallelStatus;
+            string         mParallelStatusMsg;
             WriteOp*       mPrevPtr[1];
             WriteOp*       mNextPtr[1];
 
@@ -393,8 +432,25 @@ class Writer::Impl :
                   mBeginBlock(0),
                   mEndBlock(0),
                   mOpStartTime(0),
-                  mChecksumValidFlag(false)
+                  mEnqueueUsec(0),
+                  mChecksumValidFlag(false),
+                  mParallelReplicas(),
+                  mParallelDoneCount(0),
+                  mParallelStatus(0),
+                  mParallelStatusMsg()
                 { Queue::Init(*this); }
+            // Delete every parallel replica entry, and reset the parallel
+            // completion count, status, and status message.
+            void ClearParallelReplicas()
+            {
+                for (size_t i = 0; i < mParallelReplicas.size(); i++) {
+                    delete mParallelReplicas[i];
+                }
+                mParallelReplicas.clear();
+                mParallelDoneCount = 0;
+                mParallelStatus    = 0;
+                mParallelStatusMsg.clear();
+            }
             void Delete(
                 WriteOp** inListPtr)
             {
@@ -471,7 +527,7 @@ class Writer::Impl :
             }
         private:
             virtual ~WriteOp()
-                {}
+                { ClearParallelReplicas(); }
             WriteOp(
                 const WriteOp& inWriteOp);
             WriteOp& operator=(
@@ -499,6 +555,7 @@ class Writer::Impl :
                 // cancel all pending ops by calling Stop()
                 false // inResetConnectionOnOpTimeoutFlag
               ),
+              mChunkServerPtr(0),
               mErrorCode(0),
               mRetryCount(0),
               mPendingCount(0),
@@ -509,6 +566,16 @@ class Writer::Impl :
               mAllocOp(0, 0, ""),
               mWriteIdAllocOp(0, 0, 0, 0, 0),
               mCloseOp(0, 0),
+              mParallelWriteIdReplicas(),
+              mParallelWriteIdDoneCount(0),
+              mParallelWriteIdStatus(0),
+              mParallelWriteIdStatusMsg(),
+              mParallelWriteIdStr(),
+              mParallelWritePrepReplySupportedFlag(true),
+              mParallelCloseReplicas(),
+              mParallelCloseDoneCount(0),
+              mParallelCloseStatus(0),
+              mParallelCloseStatusMsg(),
               mLastOpPtr(0),
               mSleepingFlag(false),
               mClosingFlag(false),
@@ -524,7 +591,10 @@ class Writer::Impl :
               mChunkAccessExpireTime(0),
               mCSAccessExpireTime(0),
               mUpdateLeaseOp(0, -1, 0),
-              mSleepTimer(inOuter.mNetManager, *this)
+              mSleepTimer(inOuter.mNetManager, *this),
+              mChunkCloseStartUsec(0),
+              mWriteIdAllocStartUsec(0),
+              mAllocateStartUsec(0)
         {
             SET_HANDLER(this, &ChunkWriter::EventHandler);
             Queue::Init(mPendingQueue);
@@ -693,7 +763,7 @@ class Writer::Impl :
                     }
                     return;
                 }
-                mChunkServer.Stop();
+                StopChunkServer();
                 if (mLastOpPtr == &mAllocOp) {
                     mOuter.mMetaServer.Cancel(mLastOpPtr, this);
                 }
@@ -713,7 +783,7 @@ class Writer::Impl :
                 // Start from the beginning -- chunk allocation.
                 KFS_LOG_STREAM_DEBUG << mLogPrefix <<
                     "write lease expired: " <<
-                        mChunkServer.GetServerLocation() <<
+                        GetChunkServer().GetServerLocation() <<
                     " starting from chunk allocation, pending:" <<
                     " queue: " << (Queue::IsEmpty(mPendingQueue) ? "" : "not") <<
                         " empty" <<
@@ -798,9 +868,40 @@ class Writer::Impl :
         typedef std::bitset<CHUNKSIZE / CHECKSUM_BLOCKSIZE> ChecksumBlocks;
         typedef NetManager::Timer                           Timer;
         enum { kLeaseRenewTime = LEASE_INTERVAL_SECS / 3 };
+        struct ParallelWriteIdReplica // One chunk server's write id allocation.
+        {
+            WriteIdAllocOp mOp;
+            KfsNetClient*  mClientPtr; // From the client pool; not deleted here.
+            bool           mDoneFlag;  // Set once completion or enqueue failure is counted.
+            ParallelWriteIdReplica()
+                : mOp(0, 0, 0, 0, 0),
+                  mClientPtr(0),
+                  mDoneFlag(false)
+                {}
+        private:
+            ParallelWriteIdReplica(const ParallelWriteIdReplica&); // Private: no copying.
+            ParallelWriteIdReplica& operator=(const ParallelWriteIdReplica&);
+        };
+        typedef std::vector<ParallelWriteIdReplica*> ParallelWriteIdReplicas;
+        struct ParallelCloseReplica // Holds one close op and its client pointer.
+        {
+            CloseOp       mOp;
+            KfsNetClient* mClientPtr; // Not deleted by this struct.
+            bool          mDoneFlag;  // Starts false.
+            ParallelCloseReplica()
+                : mOp(0, 0),
+                  mClientPtr(0),
+                  mDoneFlag(false)
+                {}
+        private:
+            ParallelCloseReplica(const ParallelCloseReplica&); // Private: no copying.
+            ParallelCloseReplica& operator=(const ParallelCloseReplica&);
+        };
+        typedef std::vector<ParallelCloseReplica*> ParallelCloseReplicas;
 
         Impl&          mOuter;
         ChunkServer    mChunkServer;
+        ChunkServer*   mChunkServerPtr;
         int            mErrorCode;
         int            mRetryCount;
         Offset         mPendingCount;
@@ -811,6 +912,16 @@ class Writer::Impl :
         AllocateOp     mAllocOp;
         WriteIdAllocOp mWriteIdAllocOp;
         CloseOp        mCloseOp;
+        ParallelWriteIdReplicas mParallelWriteIdReplicas;
+        int            mParallelWriteIdDoneCount;
+        int            mParallelWriteIdStatus;
+        string         mParallelWriteIdStatusMsg;
+        string         mParallelWriteIdStr;
+        bool           mParallelWritePrepReplySupportedFlag;
+        ParallelCloseReplicas mParallelCloseReplicas;
+        int            mParallelCloseDoneCount;
+        int            mParallelCloseStatus;
+        string         mParallelCloseStatusMsg;
         KfsOp*         mLastOpPtr;
         bool           mSleepingFlag;
         bool           mClosingFlag;
@@ -829,12 +940,43 @@ class Writer::Impl :
         Timer          mSleepTimer;
         WriteOp*       mPendingQueue[1];
         WriteOp*       mInFlightQueue[1];
+        int64_t        mChunkCloseStartUsec;
+        int64_t        mWriteIdAllocStartUsec;
+        int64_t        mAllocateStartUsec;
         ChunkWriter*   mPrevPtr[1];
         ChunkWriter*   mNextPtr[1];
 
         friend class QCDLListOp<ChunkWriter, 0>;
         typedef QCDLListOp<ChunkWriter, 0> ChunkWritersListOp;
 
+        // Delete all parallel write id allocation replicas, and restore
+        // the aggregate completion count, status, status message, write id
+        // string, and write prepare reply flag to their initial values.
+        void ClearParallelWriteIdReplicas()
+        {
+            for (size_t i = 0; i < mParallelWriteIdReplicas.size(); i++) {
+                delete mParallelWriteIdReplicas[i];
+            }
+            mParallelWriteIdReplicas.clear();
+            mParallelWriteIdDoneCount            = 0;
+            mParallelWriteIdStatus               = 0;
+            mParallelWriteIdStatusMsg.clear();
+            mParallelWriteIdStr.clear();
+            mParallelWritePrepReplySupportedFlag = true;
+        }
+        // Delete all parallel close replicas, and reset the aggregate
+        // completion count, status, and status message to their initial
+        // values.
+        void ClearParallelCloseReplicas()
+        {
+            for (size_t i = 0; i < mParallelCloseReplicas.size(); i++) {
+                delete mParallelCloseReplicas[i];
+            }
+            mParallelCloseReplicas.clear();
+            mParallelCloseDoneCount = 0;
+            mParallelCloseStatus    = 0;
+            mParallelCloseStatusMsg.clear();
+        }
         void UpdateLeaseExpirationTime()
         {
             mLeaseExpireTime = min(mLeaseEndTime,
@@ -875,6 +1017,7 @@ class Writer::Impl :
             mOuter.mStats.mChunkAllocCount++;
             // Use 5x chunk op timeout for "allocation" that can require
             // chunk version change.
+            mAllocateStartUsec = WriterNowUsec();
             const int theMetaOpTimeout = mOuter.mMetaServer.GetOpTimeoutSec();
             EnqueueMeta(mAllocOp, 0, max(0, max(mOuter.mOpTimeoutSec,
                     5 * theMetaOpTimeout) - theMetaOpTimeout));
@@ -885,6 +1028,12 @@ class Writer::Impl :
             IOBuffer*   inBufferPtr)
         {
             QCASSERT(&mAllocOp == &inOp && ! inBufferPtr);
+            if (0 < mAllocateStartUsec) {
+                mOuter.mStats.mAllocateUsec +=
+                    WriterNowUsec() - mAllocateStartUsec;
+                mOuter.mStats.mAllocateCount++;
+                mAllocateStartUsec = 0;
+            }
             if (inCanceledFlag) {
                 return;
             }
@@ -952,11 +1101,175 @@ class Writer::Impl :
                 mAllocOp.invalidateAllFlag
             );
         }
+        // Parallel ops need: flag on, client pool, 2+ servers, no access set.
+        bool CanParallelReplicaChunkOps() const
+        {
+            if (! mOuter.mParallelReplicaWriteFlag || ! mOuter.mClientPoolPtr) {
+                return false;
+            }
+            return (1 < mAllocOp.chunkServers.size() &&
+                mAllocOp.chunkServerAccessToken.empty() &&
+                mAllocOp.chunkAccess.empty());
+        }
+        // Reset outOp, then make it a no-forward copy of inOp's request.
+        void CopyWriteIdAllocRequest(
+            WriteIdAllocOp& outOp, const WriteIdAllocOp& inOp)
+        {
+            Reset(outOp);
+            outOp.noForwardFlag               = true;
+            outOp.writePrepReplySupportedFlag = false;
+            outOp.isForRecordAppend           = inOp.isForRecordAppend;
+            outOp.numBytes                    = inOp.numBytes;
+            outOp.offset                      = inOp.offset;
+            outOp.chunkServerLoc              = inOp.chunkServerLoc;
+            outOp.chunkVersion                = inOp.chunkVersion;
+            outOp.leaseId                     = inOp.leaseId;
+            outOp.fileId                      = inOp.fileId;
+            outOp.chunkId                     = inOp.chunkId;
+        }
+        bool TryParallelWriteIdAlloc() // False if parallel ops can't be used.
+        {
+            if (! CanParallelReplicaChunkOps()) {
+                return false;
+            }
+            ClearParallelWriteIdReplicas(); // Discard the previous attempt's state.
+            mParallelWriteIdReplicas.reserve(mAllocOp.chunkServers.size());
+            for (vector<ServerLocation>::const_iterator it =
+                        mAllocOp.chunkServers.begin();
+                    it != mAllocOp.chunkServers.end();
+                    ++it) { // Build one replica entry per chunk server.
+                ParallelWriteIdReplica* const theReplicaPtr =
+                    new ParallelWriteIdReplica();
+                CopyWriteIdAllocRequest(theReplicaPtr->mOp, mWriteIdAllocOp);
+                theReplicaPtr->mClientPtr = &mOuter.mClientPoolPtr->Get(
+                    *it, mAllocOp.allCSShortRpcFlag);
+                mParallelWriteIdReplicas.push_back(theReplicaPtr);
+            }
+            mLastOpPtr = &mWriteIdAllocOp; // Reset in DoneParallelWriteIdAlloc().
+            mWriteIdAllocStartUsec = WriterNowUsec();
+            for (ParallelWriteIdReplicas::iterator it =
+                        mParallelWriteIdReplicas.begin();
+                    it != mParallelWriteIdReplicas.end();
+                    ++it) {
+                EnqueueParallelWriteId(**it);
+            }
+            if (mParallelWriteIdDoneCount >=
+                    (int)mParallelWriteIdReplicas.size()) { // e.g. every enqueue failed
+                DoneParallelWriteIdAlloc();
+            }
+            return true;
+        }
+        void EnqueueParallelWriteId( // Submit one replica's op to its client.
+            ParallelWriteIdReplica& inReplica)
+        {
+            KFS_LOG_STREAM_DEBUG << mLogPrefix <<
+                "+> parallel " << inReplica.mOp.Show() <<
+            KFS_LOG_EOM;
+            mOuter.mStats.mChunkOpsQueuedCount++; // Counted even if enqueue fails.
+            if (! inReplica.mClientPtr->Enqueue(&inReplica.mOp, this, 0)) {
+                inReplica.mOp.status = kErrorFault; // Enqueue failed: count as done now.
+                inReplica.mDoneFlag = true;
+                mParallelWriteIdDoneCount++;
+                if (mParallelWriteIdStatus == 0) { // Keep only the first error.
+                    mParallelWriteIdStatus = kErrorFault;
+                    mParallelWriteIdStatusMsg =
+                        "parallel write id enqueue failure";
+                }
+            }
+        }
+        // Dispatch completion of inOpPtr to the replica that owns it. Returns
+        // false if the op isn't one of the parallel write id allocation ops,
+        // otherwise the result of the per replica completion handler.
+        bool DoneParallelWriteIdAlloc(
+            KfsOp*    inOpPtr,
+            bool      inCanceledFlag,
+            IOBuffer* inBufferPtr)
+        {
+            const size_t theCount = mParallelWriteIdReplicas.size();
+            for (size_t i = 0; i < theCount; i++) {
+                ParallelWriteIdReplica& theReplica = *mParallelWriteIdReplicas[i];
+                if (&theReplica.mOp != inOpPtr) {
+                    continue;
+                }
+                return DoneParallelWriteIdAlloc(
+                    theReplica, inCanceledFlag, inBufferPtr);
+            }
+            return false;
+        }
+        // Count one replica's write id allocation completion, finishing the
+        // aggregate op after the last one. Returns false if already counted.
+        bool DoneParallelWriteIdAlloc(
+            ParallelWriteIdReplica& inReplica,
+            bool                    inCanceledFlag,
+            IOBuffer*               inBufferPtr)
+        {
+            QCASSERT(! inBufferPtr);
+            if (inReplica.mDoneFlag) {
+                return false;
+            }
+            inReplica.mDoneFlag = true;
+            mParallelWriteIdDoneCount++;
+            WriteIdAllocOp& theOp = inReplica.mOp;
+            if (inCanceledFlag || theOp.status < 0) {
+                // Failure: only the first one is reported. A canceled op never
+                // counts as success, even when its status is still 0.
+                if (mParallelWriteIdStatus == 0) {
+                    mParallelWriteIdStatus =
+                        inCanceledFlag ? kErrorIo : theOp.status;
+                    mParallelWriteIdStatusMsg = inCanceledFlag ?
+                        "parallel write id canceled" : theOp.statusMsg;
+                }
+            } else if (theOp.status == 0 &&
+                    mWriteIdAllocOp.chunkAccessResponse.empty()) {
+                WriteIdAllocOp& theDst = mWriteIdAllocOp;
+                theDst.chunkAccessResponse  = theOp.chunkAccessResponse;
+                theDst.chunkServerAccessId  = theOp.chunkServerAccessId;
+                theDst.chunkServerAccessKey = theOp.chunkServerAccessKey;
+                theDst.accessResponseIssued = theOp.accessResponseIssued;
+                theDst.accessResponseValidForSec =
+                    theOp.accessResponseValidForSec;
+            }
+            if (mParallelWriteIdDoneCount >=
+                    (int)mParallelWriteIdReplicas.size()) {
+                DoneParallelWriteIdAlloc();
+            }
+            return true;
+        }
+        // Finish the parallel write id allocation: fold the per replica
+        // results into mWriteIdAllocOp, then call Done() for it.
+        void DoneParallelWriteIdAlloc()
+        {
+            if (mLastOpPtr == &mWriteIdAllocOp) {
+                mLastOpPtr = 0;
+            }
+            mWriteIdAllocOp.shortRpcFormatFlag = mAllocOp.allCSShortRpcFlag;
+            mWriteIdAllocOp.status             = mParallelWriteIdStatus;
+            mWriteIdAllocOp.statusMsg          = mParallelWriteIdStatusMsg;
+            mParallelWriteIdStr.clear();
+            mParallelWritePrepReplySupportedFlag = true;
+            for (size_t i = 0; i < mParallelWriteIdReplicas.size(); i++) {
+                const WriteIdAllocOp& theOp = mParallelWriteIdReplicas[i]->mOp;
+                if (! mParallelWriteIdStr.empty()) {
+                    mParallelWriteIdStr.append(" ");
+                }
+                mParallelWriteIdStr.append(theOp.writeIdStr);
+                if (! theOp.writePrepReplySupportedFlag) {
+                    mParallelWritePrepReplySupportedFlag = false;
+                }
+            }
+            mWriteIdAllocOp.writeIdStr = mParallelWriteIdStr;
+            mWriteIdAllocOp.writePrepReplySupportedFlag =
+                mParallelWritePrepReplySupportedFlag;
+            Done(mWriteIdAllocOp, false, 0);
+        }
         void AllocateWriteId()
         {
             QCASSERT(mAllocOp.chunkId > 0 && ! mAllocOp.chunkServers.empty());
             Reset(mWriteIdAllocOp);
+            ClearParallelWriteIdReplicas();
             mWriteIdAllocOp.chunkId                     = mAllocOp.chunkId;
+            mWriteIdAllocOp.fileId                      = mOuter.mFileId;
+            mWriteIdAllocOp.leaseId                    = mAllocOp.leaseId;
             mWriteIdAllocOp.chunkVersion                = mAllocOp.chunkVersion;
             mWriteIdAllocOp.isForRecordAppend           = false;
             mWriteIdAllocOp.chunkServerLoc              = mAllocOp.chunkServers;
@@ -964,22 +1277,34 @@ class Writer::Impl :
             mWriteIdAllocOp.numBytes                    = 0;
             mWriteIdAllocOp.writePrepReplySupportedFlag = false;
 
+            const ServerLocation& theMaster = mAllocOp.chunkServers.front();
+            if (mOuter.mClientPoolPtr) {
+                mChunkServerPtr = &mOuter.mClientPoolPtr->Get(
+                    theMaster, mAllocOp.allCSShortRpcFlag);
+            } else {
+                mChunkServerPtr = 0;
+                const ServerLocation theCurLoc = mChunkServer.GetServerLocation();
+                if (theCurLoc.IsValid() && theCurLoc != theMaster) {
+                    mChunkServer.Stop();
+                }
+                mChunkServer.SetRpcFormat(mAllocOp.allCSShortRpcFlag ?
+                    ChunkServer::kRpcFormatShort : ChunkServer::kRpcFormatLong);
+            }
+
             const time_t theNow = Now();
             mHasSubjectIdFlag = false;
             mChunkAccess.clear();
 
             const bool theCSClearTextAllowedFlag =
                 mOuter.IsChunkServerClearTextAllowed();
-            mChunkServer.SetShutdownSsl(
+            GetChunkServer().SetShutdownSsl(
                 mAllocOp.allowCSClearTextFlag &&
                 theCSClearTextAllowedFlag
             );
-            mChunkServer.SetRpcFormat(mAllocOp.allCSShortRpcFlag ?
-                ChunkServer::kRpcFormatShort : ChunkServer::kRpcFormatLong);
             if (mAllocOp.chunkServerAccessToken.empty() ||
                     mAllocOp.chunkAccess.empty()) {
-                mChunkServer.SetKey(0, 0, 0, 0);
-                mChunkServer.SetAuthContext(0);
+                GetChunkServer().SetKey(0, 0, 0, 0);
+                GetChunkServer().SetAuthContext(0);
                 if (! mAllocOp.chunkServerAccessToken.empty()) {
                     mWriteIdAllocOp.status    = -EINVAL;
                     mWriteIdAllocOp.statusMsg = "no chunk access";
@@ -995,7 +1320,7 @@ class Writer::Impl :
                     mCSAccessExpireTime    = mChunkAccessExpireTime;
                 }
             } else {
-                mChunkServer.SetKey(
+                GetChunkServer().SetKey(
                     mAllocOp.chunkServerAccessToken.data(),
                     mAllocOp.chunkServerAccessToken.size(),
                     mAllocOp.chunkServerAccessKey.GetPtr(),
@@ -1019,19 +1344,23 @@ class Writer::Impl :
                 if (mAllocOp.allowCSClearTextFlag &&
                         theCSClearTextAllowedFlag &&
                         mWriteIdAllocOp.createChunkServerAccessFlag) {
-                    mWriteIdAllocOp.decryptKey = &mChunkServer.GetSessionKey();
+                    mWriteIdAllocOp.decryptKey = &GetChunkServer().GetSessionKey();
                 }
-                if (! mChunkServer.GetAuthContext()) {
-                    mChunkServer.SetAuthContext(
+                if (! GetChunkServer().GetAuthContext()) {
+                    GetChunkServer().SetAuthContext(
                         mOuter.mMetaServer.GetAuthContext());
                 }
             }
             if (mWriteIdAllocOp.status == 0) {
                 const bool kCancelPendingOpsFlag = true;
-                if (mChunkServer.SetServer(
-                        mAllocOp.chunkServers[0],
+                if (mChunkServerPtr || mChunkServer.SetServer(
+                        theMaster,
                         kCancelPendingOpsFlag,
                         &mWriteIdAllocOp.statusMsg)) {
+                    if (TryParallelWriteIdAlloc()) {
+                        return;
+                    }
+                    mWriteIdAllocStartUsec = WriterNowUsec();
                     Enqueue(mWriteIdAllocOp);
                     return;
                 }
@@ -1068,7 +1397,7 @@ class Writer::Impl :
             }
             if (0 < inOp.accessResponseValidForSec &&
                     ! inOp.chunkServerAccessId.empty()) {
-                mChunkServer.SetKey(
+                GetChunkServer().SetKey(
                     inOp.chunkServerAccessId.data(),
                     inOp.chunkServerAccessId.size(),
                     inOp.chunkServerAccessKey.GetPtr(),
@@ -1101,8 +1430,8 @@ class Writer::Impl :
                 inOp.subjectId = mWriteIds.front().writeId;
             }
             if (inOp.createChunkServerAccessFlag &&
-                    mChunkServer.IsShutdownSsl()) {
-                inOp.decryptKey = &mChunkServer.GetSessionKey();
+                    GetChunkServer().IsShutdownSsl()) {
+                inOp.decryptKey = &GetChunkServer().GetSessionKey();
             }
             // Roll forward access time to indicate the request is in flight.
             // If op fails or times out, then write restarts from write id
@@ -1114,12 +1443,28 @@ class Writer::Impl :
                 mCSAccessExpireTime = theNow + LEASE_INTERVAL_SECS * 3 / 2;
             }
         }
+        void SetAccess(
+            ChunkAccessOp&   inOp,
+            const WriteInfo& inWriteInfo,
+            bool             inCanRequestAccessFlag = true)
+        {
+            // Regular setup first; then use inWriteInfo's id as subject id.
+            SetAccess(inOp, inCanRequestAccessFlag);
+            if (! inOp.hasSubjectIdFlag) { return; }
+            inOp.subjectId = inWriteInfo.writeId;
+        }
         void Done(
             WriteIdAllocOp& inOp,
             bool            inCanceledFlag,
             IOBuffer*       inBufferPtr)
         {
             QCASSERT(&mWriteIdAllocOp == &inOp && ! inBufferPtr);
+            if (0 < mWriteIdAllocStartUsec) {
+                mOuter.mStats.mWriteIdAllocUsec +=
+                    WriterNowUsec() - mWriteIdAllocStartUsec;
+                mOuter.mStats.mWriteIdAllocCount++;
+                mWriteIdAllocStartUsec = 0;
+            }
             mWriteIds.clear();
             if (inCanceledFlag) {
                 return;
@@ -1224,14 +1569,20 @@ class Writer::Impl :
                 inWriteOp.mWritePrepareOp.replyRequestedFlag
             );
             if (inWriteOp.mWritePrepareOp.replyRequestedFlag) {
-                if (! inWriteOp.mChecksumValidFlag) {
+                if (inWriteOp.mWritePrepareOp.checksums.empty()) {
+                    inWriteOp.mWritePrepareOp.checksums = ComputeChecksums(
+                        &inWriteOp.mBuffer,
+                        inWriteOp.mWritePrepareOp.numBytes,
+                        &inWriteOp.mWritePrepareOp.checksum
+                    );
+                    inWriteOp.mChecksumValidFlag = true;
+                } else if (! inWriteOp.mChecksumValidFlag) {
                     inWriteOp.mWritePrepareOp.checksum = ComputeBlockChecksum(
                         &inWriteOp.mBuffer,
                         inWriteOp.mWritePrepareOp.numBytes
                     );
                     inWriteOp.mChecksumValidFlag = true;
                 }
-                inWriteOp.mWritePrepareOp.checksums.clear();
             } else {
                 if (inWriteOp.mWritePrepareOp.checksums.empty()) {
                     inWriteOp.mWritePrepareOp.checksums = ComputeChecksums(
@@ -1256,12 +1607,159 @@ class Writer::Impl :
                 SetAccess(inWriteOp.mWriteSyncOp);
             }
             inWriteOp.mOpStartTime = Now();
+            inWriteOp.mEnqueueUsec = WriterNowUsec();
             Queue::Remove(mPendingQueue, inWriteOp);
             Queue::PushBack(mInFlightQueue, inWriteOp);
             mOuter.mStats.mOpsWriteCount++;
             mOuter.mStats.mOpsWriteByteCount += inWriteOp.contentLength;
+            if (TryParallelReplicaWrite(inWriteOp)) {
+                return;
+            }
             Enqueue(inWriteOp, &inWriteOp.mBuffer);
         }
+        bool CanParallelReplicaWrite() const
+        {
+            // Needs: feature enabled, a client pool, write prepare reply
+            // support, more than one write id, and no chunk access in use.
+            if (! mOuter.mParallelReplicaWriteFlag || ! mOuter.mClientPoolPtr) {
+                return false;
+            }
+            return (mWriteIdAllocOp.writePrepReplySupportedFlag &&
+                1 < mWriteIds.size() && mAllocOp.chunkAccess.empty() &&
+                mAllocOp.chunkServerAccessToken.empty());
+        }
+        bool TryParallelReplicaWrite(
+            WriteOp& inWriteOp)
+        {
+            // Queue one write prepare per write id, each with no forward set.
+            // Returns false, with nothing queued, if this mode can't be used.
+            if (! CanParallelReplicaWrite()) {
+                return false;
+            }
+            WritePrepareOp&            theSrcOp = inWriteOp.mWritePrepareOp;
+            WriteOp::ParallelReplicas& theAll   = inWriteOp.mParallelReplicas;
+            inWriteOp.ClearParallelReplicas();
+            theAll.reserve(mWriteIds.size());
+            WriteIds::const_iterator theIdIt = mWriteIds.begin();
+            for (; theIdIt != mWriteIds.end(); ++theIdIt) {
+                WriteOp::ParallelReplica* const thePtr =
+                    new WriteOp::ParallelReplica();
+                WritePrepareOp& theOp = thePtr->mPrepareOp;
+                Reset(theOp);
+                theOp.chunkId            = theSrcOp.chunkId;
+                theOp.chunkVersion       = theSrcOp.chunkVersion;
+                theOp.offset             = theSrcOp.offset;
+                theOp.numBytes           = theSrcOp.numBytes;
+                theOp.contentLength      = inWriteOp.contentLength;
+                theOp.checksum           = theSrcOp.checksum;
+                theOp.checksums          = theSrcOp.checksums;
+                theOp.replyRequestedFlag = true;
+                theOp.noForwardFlag      = true;
+                theOp.writeInfo          = mWriteIds;
+                SetAccess(theOp, *theIdIt, true);
+                thePtr->mBuffer.AppendShared(inWriteOp.mBuffer);
+                thePtr->mClientPtr = &mOuter.mClientPoolPtr->Get(
+                    theIdIt->serverLoc, mAllocOp.allCSShortRpcFlag);
+                theAll.push_back(thePtr);
+            }
+            WriteOp::ParallelReplicas::iterator theIt = theAll.begin();
+            for (; theIt != theAll.end(); ++theIt) {
+                EnqueueParallelReplica(inWriteOp, **theIt);
+            }
+            // Everything already done, e.g. all enqueues failed: finish here.
+            if ((int)theAll.size() <= inWriteOp.mParallelDoneCount) {
+                inWriteOp.status    = inWriteOp.mParallelStatus;
+                inWriteOp.statusMsg = inWriteOp.mParallelStatusMsg;
+                Done(inWriteOp, false, &inWriteOp.mBuffer);
+            }
+            return true;
+        }
+        void EnqueueParallelReplica(
+            WriteOp&                  inWriteOp,
+            WriteOp::ParallelReplica& inReplica)
+        {
+            KFS_LOG_STREAM_DEBUG << mLogPrefix <<
+                "+> parallel " << inReplica.mPrepareOp.Show() <<
+                " buffer: " << static_cast<void*>(&inReplica.mBuffer) <<
+                "/" << inReplica.mBuffer.BytesConsumable() <<
+            KFS_LOG_EOM;
+            mOuter.mStats.mChunkOpsQueuedCount++;
+            if (inReplica.mClientPtr->Enqueue(
+                    &inReplica.mPrepareOp, this, &inReplica.mBuffer)) {
+                return;
+            }
+            // Enqueue failed: count the replica as done, keep the first error.
+            inReplica.mPrepareOp.status = kErrorFault;
+            inReplica.mDoneFlag         = true;
+            inWriteOp.mParallelDoneCount++;
+            if (inWriteOp.mParallelStatus == 0) {
+                inWriteOp.mParallelStatus    = kErrorFault;
+                inWriteOp.mParallelStatusMsg = "parallel write enqueue failure";
+            }
+        }
+        bool DoneParallelReplica(
+            WriteOp&                  inWriteOp,
+            WriteOp::ParallelReplica& inReplica,
+            bool                      inCanceledFlag,
+            IOBuffer*                 inBufferPtr)
+        {
+            // Records one replica completion; the last one completes the op.
+            // NOTE(review): cancel is reported as kErrorIo, not as canceled.
+            if (inReplica.mDoneFlag) {
+                return false;
+            }
+            QCASSERT(inBufferPtr == &inReplica.mBuffer);
+            WritePrepareOp& theSrc = inReplica.mPrepareOp;
+            WritePrepareOp& theDst = inWriteOp.mWritePrepareOp;
+            inReplica.mDoneFlag = true;
+            inWriteOp.mParallelDoneCount++;
+            if ((inCanceledFlag || theSrc.status < 0) &&
+                    inWriteOp.mParallelStatus == 0) {
+                if (inCanceledFlag) {
+                    inWriteOp.mParallelStatus    = kErrorIo;
+                    inWriteOp.mParallelStatusMsg = "parallel write canceled";
+                } else {
+                    inWriteOp.mParallelStatus    = theSrc.status;
+                    inWriteOp.mParallelStatusMsg = theSrc.statusMsg;
+                }
+            } else if (theSrc.status == 0 &&
+                    theDst.chunkAccessResponse.empty()) {
+                theDst.chunkAccessResponse  = theSrc.chunkAccessResponse;
+                theDst.chunkServerAccessId  = theSrc.chunkServerAccessId;
+                theDst.chunkServerAccessKey = theSrc.chunkServerAccessKey;
+                theDst.accessResponseIssued = theSrc.accessResponseIssued;
+                theDst.accessResponseValidForSec =
+                    theSrc.accessResponseValidForSec;
+            }
+            if ((int)inWriteOp.mParallelReplicas.size() <=
+                    inWriteOp.mParallelDoneCount) {
+                inWriteOp.status    = inWriteOp.mParallelStatus;
+                inWriteOp.statusMsg = inWriteOp.mParallelStatusMsg;
+                Done(inWriteOp, false, &inWriteOp.mBuffer);
+            }
+            return true;
+        }
+        bool DoneParallelReplica(
+            KfsOp*    inOpPtr,
+            bool      inCanceledFlag,
+            IOBuffer* inBufferPtr)
+        {
+            // Dispatch to the in flight write op whose replica owns inOpPtr.
+            // Returns false if no in flight write op has such a replica.
+            Queue::Iterator theQueueIt(mInFlightQueue);
+            for (WriteOp* theOpPtr; (theOpPtr = theQueueIt.Next()); ) {
+                WriteOp::ParallelReplicas::iterator theIt =
+                    theOpPtr->mParallelReplicas.begin();
+                for (; theIt != theOpPtr->mParallelReplicas.end(); ++theIt) {
+                    if (inOpPtr != &(*theIt)->mPrepareOp) {
+                        continue;
+                    }
+                    return DoneParallelReplica(
+                        *theOpPtr, **theIt, inCanceledFlag, inBufferPtr);
+                }
+            }
+            return false;
+        }
         void Done(
             WriteOp&  inOp,
             bool      inCanceledFlag,
@@ -1282,13 +1780,19 @@ class Writer::Impl :
                     Monitor::ReportError(
                             Monitor::kWriteOpError,
                             mOuter.mMetaServer.GetMetaServerLocation(),
-                            mChunkServer.GetServerLocation(),
+                            GetChunkServer().GetServerLocation(),
                             inOp.status);
                     mOpStartTime = inOp.mOpStartTime;
                     HandleError(inOp);
                 }
                 return;
             }
+            if (0 < inOp.mEnqueueUsec) {
+                mOuter.mStats.mChunkWriteUsec +=
+                    WriterNowUsec() - inOp.mEnqueueUsec;
+                mOuter.mStats.mChunkWriteCount++;
+                inOp.mEnqueueUsec = 0;
+            }
             const Offset theOffset    = inOp.mWritePrepareOp.offset;
             const Offset theDoneCount = inOp.mBuffer.BytesConsumable();
             QCASSERT(
@@ -1349,10 +1853,121 @@ class Writer::Impl :
             UpdateLeaseExpirationTime();
             StartWrite();
         }
+        bool TryParallelCloseChunk()
+        {
+            // Queue one close per write id, each with no forward set.
+            // Returns false, with nothing queued, if this mode can't be used.
+            const bool theOkFlag = CanParallelReplicaChunkOps() &&
+                0 <= mCloseOp.chunkVersion && ! mCloseOp.writeInfo.empty();
+            if (! theOkFlag) {
+                return false;
+            }
+            ClearParallelCloseReplicas();
+            mParallelCloseReplicas.reserve(mCloseOp.writeInfo.size());
+            WriteIds::const_iterator theIdIt = mCloseOp.writeInfo.begin();
+            for (; theIdIt != mCloseOp.writeInfo.end(); ++theIdIt) {
+                ParallelCloseReplica* const thePtr = new ParallelCloseReplica();
+                CloseOp& theOp = thePtr->mOp;
+                Reset(theOp);
+                theOp.chunkId       = mCloseOp.chunkId;
+                theOp.chunkVersion  = mCloseOp.chunkVersion;
+                theOp.writeInfo     = mCloseOp.writeInfo;
+                theOp.noForwardFlag = true;
+                SetAccess(theOp, *theIdIt, true);
+                thePtr->mClientPtr = &mOuter.mClientPoolPtr->Get(
+                    theIdIt->serverLoc, mAllocOp.allCSShortRpcFlag);
+                mParallelCloseReplicas.push_back(thePtr);
+            }
+            mLastOpPtr           = &mCloseOp;
+            mChunkCloseStartUsec = WriterNowUsec();
+            for (ParallelCloseReplicas::iterator theIt =
+                    mParallelCloseReplicas.begin();
+                    theIt != mParallelCloseReplicas.end(); ++theIt) {
+                EnqueueParallelClose(**theIt);
+            }
+            // Everything already done, e.g. all enqueues failed: finish here.
+            if ((int)mParallelCloseReplicas.size() <= mParallelCloseDoneCount) {
+                DoneParallelClose();
+            }
+            return true;
+        }
+        void EnqueueParallelClose(
+            ParallelCloseReplica& inReplica)
+        {
+            KFS_LOG_STREAM_DEBUG << mLogPrefix << "+> parallel " <<
+                inReplica.mOp.Show() << KFS_LOG_EOM;
+            mOuter.mStats.mChunkOpsQueuedCount++;
+            if (inReplica.mClientPtr->Enqueue(&inReplica.mOp, this, 0)) {
+                return;
+            }
+            // Enqueue failed: count the replica as done, keep the first error.
+            inReplica.mOp.status = kErrorFault;
+            inReplica.mDoneFlag  = true;
+            mParallelCloseDoneCount++;
+            if (mParallelCloseStatus == 0) {
+                mParallelCloseStatus    = kErrorFault;
+                mParallelCloseStatusMsg = "parallel close enqueue failure";
+            }
+        }
+        bool DoneParallelClose(
+            KfsOp*    inOpPtr,
+            bool      inCanceledFlag,
+            IOBuffer* inBufferPtr)
+        {
+            // Dispatch to the parallel close replica that owns inOpPtr.
+            // Returns false if there is no such replica; with an empty
+            // replica list the loop below is simply not entered.
+            for (ParallelCloseReplicas::iterator theIt =
+                    mParallelCloseReplicas.begin();
+                    theIt != mParallelCloseReplicas.end(); ++theIt) {
+                if (inOpPtr != &(*theIt)->mOp) {
+                    continue;
+                }
+                return DoneParallelClose(
+                    **theIt, inCanceledFlag, inBufferPtr);
+            }
+            return false;
+        }
+        bool DoneParallelClose(
+            ParallelCloseReplica& inReplica,
+            bool                  inCanceledFlag,
+            IOBuffer*             inBufferPtr)
+        {
+            // Records one replica completion; the last one completes close.
+            QCASSERT(! inBufferPtr);
+            if (inReplica.mDoneFlag) {
+                return false;
+            }
+            inReplica.mDoneFlag = true;
+            mParallelCloseDoneCount++;
+            if (mParallelCloseStatus == 0) {
+                if (inCanceledFlag) {
+                    mParallelCloseStatus    = kErrorIo;
+                    mParallelCloseStatusMsg = "parallel close canceled";
+                } else if (inReplica.mOp.status < 0) {
+                    mParallelCloseStatus    = inReplica.mOp.status;
+                    mParallelCloseStatusMsg = inReplica.mOp.statusMsg;
+                }
+            }
+            if ((int)mParallelCloseReplicas.size() <= mParallelCloseDoneCount) {
+                DoneParallelClose();
+            }
+            return true;
+        }
+        void DoneParallelClose()
+        {
+            // Copy the accumulated parallel close status into mCloseOp, and
+            // run the regular chunk close completion.
+            mCloseOp.status    = mParallelCloseStatus;
+            mCloseOp.statusMsg = mParallelCloseStatusMsg;
+            if (&mCloseOp == mLastOpPtr) { mLastOpPtr = 0; }
+            Done(mCloseOp, false, 0);
+        }
         void CloseChunk()
         {
             QCASSERT(mAllocOp.chunkId > 0);
             Reset(mCloseOp);
+            ClearParallelCloseReplicas();
             mCloseOp.chunkId      = mAllocOp.chunkId;
             mCloseOp.chunkVersion = mAllocOp.chunkVersion;
             mCloseOp.writeInfo    = mWriteIds;
@@ -1362,6 +1977,9 @@ class Writer::Impl :
                 mCloseOp.chunkServerLoc.clear();
             }
             SetAccess(mCloseOp);
+            if (TryParallelCloseChunk()) {
+                return;
+            }
             if (mCloseOp.chunkVersion < 0) {
                 // Extend timeout to accommodate object commit, possibly single
                 // atomic 64MB "object" write.
@@ -1377,10 +1995,11 @@ class Writer::Impl :
                     " version: "             << mCloseOp.chunkVersion <<
                     " chunk close timeout: " << theTimeout << " sec." <<
                 KFS_LOG_EOM;
-                mChunkServer.SetOpTimeoutSec(theTimeout);
+                GetChunkServer().SetOpTimeoutSec(theTimeout);
             }
             mWriteIds.clear();
             mAllocOp.chunkId = -1;
+            mChunkCloseStartUsec = WriterNowUsec();
             Enqueue(mCloseOp);
         }
         void Done(
@@ -1389,9 +2008,15 @@ class Writer::Impl :
             IOBuffer* inBufferPtr)
         {
             QCASSERT(&mCloseOp == &inOp && ! inBufferPtr);
+            if (0 < mChunkCloseStartUsec) {
+                mOuter.mStats.mChunkCloseUsec +=
+                    WriterNowUsec() - mChunkCloseStartUsec;
+                mOuter.mStats.mChunkCloseCount++;
+                mChunkCloseStartUsec = 0;
+            }
             if (mCloseOp.chunkVersion < 0) {
                 // Restore timeout, changed by CloseChunk().
-                mChunkServer.SetOpTimeoutSec(mOuter.mOpTimeoutSec);
+                GetChunkServer().SetOpTimeoutSec(mOuter.mOpTimeoutSec);
             }
             if (inCanceledFlag) {
                 return;
@@ -1408,7 +2033,10 @@ class Writer::Impl :
             }
             mKeepLeaseFlag = false;
             mCloseOp.chunkId = -1;
+            const int64_t resetStartUsec = WriterNowUsec();
             Reset();
+            mOuter.mStats.mChunkResetUsec += WriterNowUsec() - resetStartUsec;
+            mOuter.mStats.mChunkResetCount++;
             StartWrite();
         }
         virtual void OpDone(
@@ -1449,12 +2077,21 @@ class Writer::Impl :
                 Done(mAllocOp, inCanceledFlag, inBufferPtr);
             } else if (&mWriteIdAllocOp == inOpPtr) {
                 Done(mWriteIdAllocOp, inCanceledFlag, inBufferPtr);
+            } else if (DoneParallelWriteIdAlloc(
+                    inOpPtr, inCanceledFlag, inBufferPtr)) {
+                return;
             } else if (&mAllocOp == inOpPtr) {
                 Done(mAllocOp, inCanceledFlag, inBufferPtr);
             } else if (&mCloseOp == inOpPtr) {
                 Done(mCloseOp, inCanceledFlag, inBufferPtr);
+            } else if (DoneParallelClose(
+                    inOpPtr, inCanceledFlag, inBufferPtr)) {
+                return;
             } else if (&mUpdateLeaseOp == inOpPtr) {
                 Done(mUpdateLeaseOp, inCanceledFlag, inBufferPtr);
+            } else if (DoneParallelReplica(
+                    inOpPtr, inCanceledFlag, inBufferPtr)) {
+                return;
             } else if (inOpPtr && inOpPtr->op == CMD_WRITE) {
                 Done(*static_cast<WriteOp*>(inOpPtr),
                     inCanceledFlag, inBufferPtr);
@@ -1462,10 +2099,22 @@ class Writer::Impl :
                 mOuter.InternalError("unexpected operation completion");
             }
         }
+        void StopChunkServer()
+        {   // NOTE(review): other replica clients are not canceled here; confirm.
+            if (mChunkServerPtr) { // set from the client pool by AllocateWriteId
+                mChunkServerPtr->CancelAllWithOwner(this); // this owner's ops
+                mChunkServerPtr = 0;
+            }
+            mChunkServer.Stop(); // the writer's own client is always stopped
+        }
+        ChunkServer& GetChunkServer() // *mChunkServerPtr if set, else mChunkServer
+            { return (mChunkServerPtr ? *mChunkServerPtr : mChunkServer); }
+        const ChunkServer& GetChunkServer() const // const flavor of the above
+            { return (mChunkServerPtr ? *mChunkServerPtr : mChunkServer); }
         void Enqueue(
             KfsOp&    inOp,
             IOBuffer* inBufferPtr = 0)
-            { EnqueueSelf(inOp, inBufferPtr, &mChunkServer, 0); }
+            { EnqueueSelf(inOp, inBufferPtr, &GetChunkServer(), 0); }
         void EnqueueMeta(
             KfsOp&    inOp,
             IOBuffer* inBufferPtr    = 0,
@@ -1480,7 +2129,9 @@ class Writer::Impl :
             mWriteIds.clear();
             mAllocOp.chunkId = 0;
             mLastOpPtr       = 0;
-            mChunkServer.Stop();
+            StopChunkServer();
+            ClearParallelWriteIdReplicas();
+            ClearParallelCloseReplicas();
             QCASSERT(Queue::IsEmpty(mInFlightQueue));
             if (mSleepingFlag) {
                 mSleepTimer.RemoveTimeout();
@@ -1532,9 +2183,9 @@ class Writer::Impl :
                 " status: "               << inOp.status    <<
                 " msg: "                  << inOp.statusMsg <<
                 " op: "                   << inOp.Show()    <<
-                " current chunk server: " << mChunkServer.GetServerLocation() <<
-                " chunkserver: "          << (mChunkServer.IsDataSent() ?
-                    (mChunkServer.IsAllDataSent() ? "all" : "partial") :
+                " current chunk server: " << GetChunkServer().GetServerLocation() <<
+                " chunkserver: "          << (GetChunkServer().IsDataSent() ?
+                    (GetChunkServer().IsAllDataSent() ? "all" : "partial") :
                     "no") << " data sent" <<
                 " retry: "                << mRetryCount <<
                 "\nRequest:\n"            << theOStream.str() <<
@@ -1757,6 +2408,8 @@ class Writer::Impl :
     Offset              mOffset;
     Offset              mOpenChunkBlockSize;
     int64_t             mChunkServerInitialSeqNum;
+    ClientPool* const   mClientPoolPtr;
+    const bool          mParallelReplicaWriteFlag;
     Completion*         mCompletionPtr;
     IOBuffer            mBuffer;
     string const        mLogPrefix;
@@ -1768,6 +2421,8 @@ class Writer::Impl :
     int                 mCompletionDepthCount;
     int                 mStriperProcessCount;
     Striper*            mStriperPtr;
+    int64_t             mCloseStartUsec;
+    int64_t             mSetSizeStartUsec;
     ChunkWriter*        mWriters[1];
 
     void InternalError(
@@ -1872,6 +2527,7 @@ class Writer::Impl :
         mTruncateOp.fid        = mFileId;
         mTruncateOp.fileOffset = theSize;
         mTruncateOp.status     = 0;
+        mSetSizeStartUsec   = WriterNowUsec();
         KFS_LOG_STREAM_DEBUG << mLogPrefix <<
             "meta +> " << mTruncateOp.Show() <<
         KFS_LOG_EOM;
@@ -1896,6 +2552,11 @@ class Writer::Impl :
         if (inOpPtr != &mTruncateOp) {
             return;
         }
+        if (0 < mSetSizeStartUsec) {
+            mStats.mSetSizeUsec += WriterNowUsec() - mSetSizeStartUsec;
+            mStats.mSetSizeCount++;
+            mSetSizeStartUsec = 0;
+        }
         mTruncateOp.pathname = 0;
         mTruncateOp.fid      = -1;
         if (inCanceledFlag) {
@@ -2121,6 +2782,11 @@ class Writer::Impl :
             if (mClosingFlag && Writers::IsEmpty(mWriters) && ! mSleepingFlag) {
                 SetFileSize();
                 if (mTruncateOp.fid < 0 && ! mSleepingFlag) {
+                    if (0 < mCloseStartUsec) {
+                        mStats.mCloseUsec += WriterNowUsec() - mCloseStartUsec;
+                        mStats.mCloseCount++;
+                        mCloseStartUsec = 0;
+                    }
                     mClosingFlag = false;
                     mFileId = -1;
                     Striper* const theStriperPtr = mStriperPtr;
@@ -2214,7 +2880,9 @@ Writer::Writer(
     int                 inIdleTimeoutSec,
     int                 inMaxWriteSize,
     const char*         inLogPrefixPtr,
-    int64_t             inChunkServerInitialSeqNum)
+    int64_t             inChunkServerInitialSeqNum,
+    ClientPool*         inClientPoolPtr,
+    bool                inParallelReplicaWriteFlag)
     : mImpl(*new Writer::Impl(
         *this,
         inMetaServer,
@@ -2228,7 +2896,9 @@ Writer::Writer(
         inMaxWriteSize,
         (inLogPrefixPtr && inLogPrefixPtr[0]) ?
             (inLogPrefixPtr + string(" ")) : string(),
-        inChunkServerInitialSeqNum
+        inChunkServerInitialSeqNum,
+        inClientPoolPtr,
+        inParallelReplicaWriteFlag
     ))
 {
     mImpl.Ref();
diff --git a/src/cc/libclient/Writer.h b/src/cc/libclient/Writer.h
index 10d6b2a19..aef754650 100644
--- a/src/cc/libclient/Writer.h
+++ b/src/cc/libclient/Writer.h
@@ -40,6 +40,8 @@ namespace client
 {
 using std::string;
 
+class ClientPool;
+
 // Kfs client write protocol state machine.
 class Writer
 {
@@ -85,7 +87,21 @@ class Writer
               mRetriesCount(0),
               mWriteCount(0),
               mWriteByteCount(0),
-              mBufferCompactionCount(0)
+              mBufferCompactionCount(0),
+              mCloseCount(0),
+              mCloseUsec(0),
+              mSetSizeCount(0),
+              mSetSizeUsec(0),
+              mChunkCloseCount(0),
+              mChunkCloseUsec(0),
+              mChunkWriteCount(0),
+              mChunkWriteUsec(0),
+              mChunkResetCount(0),
+              mChunkResetUsec(0),
+              mWriteIdAllocCount(0),
+              mWriteIdAllocUsec(0),
+              mAllocateCount(0),
+              mAllocateUsec(0)
             {}
         void Clear()
             { *this = Stats(); }
@@ -104,6 +120,20 @@ class Writer
             mWriteCount            += inStats.mWriteCount;
             mWriteByteCount        += inStats.mWriteByteCount;
             mBufferCompactionCount += inStats.mBufferCompactionCount;
+            mCloseCount           += inStats.mCloseCount;
+            mCloseUsec            += inStats.mCloseUsec;
+            mSetSizeCount         += inStats.mSetSizeCount;
+            mSetSizeUsec          += inStats.mSetSizeUsec;
+            mChunkCloseCount      += inStats.mChunkCloseCount;
+            mChunkCloseUsec       += inStats.mChunkCloseUsec;
+            mChunkWriteCount      += inStats.mChunkWriteCount;
+            mChunkWriteUsec       += inStats.mChunkWriteUsec;
+            mChunkResetCount      += inStats.mChunkResetCount;
+            mChunkResetUsec       += inStats.mChunkResetUsec;
+            mWriteIdAllocCount   += inStats.mWriteIdAllocCount;
+            mWriteIdAllocUsec    += inStats.mWriteIdAllocUsec;
+            mAllocateCount       += inStats.mAllocateCount;
+            mAllocateUsec        += inStats.mAllocateUsec;
             return *this;
         }
         template<typename T>
@@ -122,6 +152,20 @@ class Writer
             inFunctor("Retries",           mRetriesCount);
             inFunctor("Writes" ,           mWriteCount);
             inFunctor("WriteBytes",        mWriteByteCount);
+            inFunctor("CloseCount",       mCloseCount);
+            inFunctor("CloseUsec",        mCloseUsec);
+            inFunctor("SetSizeCount",     mSetSizeCount);
+            inFunctor("SetSizeUsec",      mSetSizeUsec);
+            inFunctor("ChunkCloseCount",  mChunkCloseCount);
+            inFunctor("ChunkCloseUsec",   mChunkCloseUsec);
+            inFunctor("ChunkWriteCount",  mChunkWriteCount);
+            inFunctor("ChunkWriteUsec",   mChunkWriteUsec);
+            inFunctor("ChunkResetCount",  mChunkResetCount);
+            inFunctor("ChunkResetUsec",   mChunkResetUsec);
+            inFunctor("WriteIdAllocCount", mWriteIdAllocCount);
+            inFunctor("WriteIdAllocUsec",  mWriteIdAllocUsec);
+            inFunctor("AllocateCount",    mAllocateCount);
+            inFunctor("AllocateUsec",     mAllocateUsec);
         }
         Counter mMetaOpsQueuedCount;
         Counter mMetaOpsCancelledCount;
@@ -135,6 +179,20 @@ class Writer
         Counter mWriteCount;
         Counter mWriteByteCount;
         Counter mBufferCompactionCount;
+        Counter mCloseCount;
+        Counter mCloseUsec;
+        Counter mSetSizeCount;
+        Counter mSetSizeUsec;
+        Counter mChunkCloseCount;
+        Counter mChunkCloseUsec;
+        Counter mChunkWriteCount;
+        Counter mChunkWriteUsec;
+        Counter mChunkResetCount;
+        Counter mChunkResetUsec;
+        Counter mWriteIdAllocCount;
+        Counter mWriteIdAllocUsec;
+        Counter mAllocateCount;
+        Counter mAllocateUsec;
     };
     class Striper
     {
@@ -200,7 +258,9 @@ class Writer
         int         inIdleTimeoutSec,
         int         inMaxWriteSize,
         const char* inLogPrefixPtr,
-        int64_t     inChunkServerInitialSeqNum);
+        int64_t     inChunkServerInitialSeqNum,
+        ClientPool* inClientPoolPtr = 0,
+        bool        inParallelReplicaWriteFlag = false);
     virtual ~Writer();
     int Open(
         kfsFileId_t inFileId,
diff --git a/src/cc/meta/CMakeLists.txt b/src/cc/meta/CMakeLists.txt
index 734cb4405..77524ab57 100644
--- a/src/cc/meta/CMakeLists.txt
+++ b/src/cc/meta/CMakeLists.txt
@@ -35,9 +35,11 @@ set (lib_srcs
     kfsops.cc
     kfstree.cc
     LayoutManager.cc
+    layoutmanager_instance.cc
     meta.cc
     MetaRequest.cc
     NetDispatch.cc
+    NamespaceV2.cc
     Replay.cc
     Restorer.cc
     util.cc
@@ -62,7 +64,7 @@ target_link_libraries(kfsMeta
 )
 
 if (NOT USE_STATIC_LIB_LINKAGE)
-    add_library (kfsMeta-shared SHARED ${lib_srcs} layoutmanager_instance.cc)
+    add_library (kfsMeta-shared SHARED ${lib_srcs})
     set_target_properties (kfsMeta-shared PROPERTIES OUTPUT_NAME "qfs_meta")
     set_target_properties (kfsMeta-shared PROPERTIES CLEAN_DIRECT_OUTPUT 1)
     target_link_libraries(kfsMeta-shared
@@ -114,6 +116,21 @@ if (CMAKE_SYSTEM_NAME STREQUAL "SunOS")
    target_link_libraries(kfsMeta umem)
 endif (CMAKE_SYSTEM_NAME STREQUAL "SunOS")
 
+add_executable (namespacev2test namespacev2test_main.cc NamespaceV2.cc) # compiles NamespaceV2.cc directly instead of linking kfsMeta
+target_link_libraries(namespacev2test kfsCommon)
+add_dependencies(namespacev2test kfsCommon)
+qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/devtools" namespacev2test) # NOTE(review): helper is not defined in this file; presumably from the top-level CMakeLists.txt -- confirm
+
+add_executable (namespacev2bench namespacev2bench_main.cc NamespaceV2.cc) # same source layout as namespacev2test, different main
+target_link_libraries(namespacev2bench kfsCommon)
+add_dependencies(namespacev2bench kfsCommon)
+qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/devtools" namespacev2bench)
+
+add_executable (namespacev2walreplaytest namespacev2walreplaytest_main.cc) # links the whole kfsMeta library rather than compiling NamespaceV2.cc itself
+target_link_libraries(namespacev2walreplaytest kfsMeta)
+add_dependencies(namespacev2walreplaytest kfsMeta)
+qfs_set_target_runtime_output_dir("${QFS_OUTPUT_DIR}/bin/devtools" namespacev2walreplaytest)
+
 if (CYGWIN)
     # Workaround for "too many sections" asm failure by turning on optimization
     # for compiler to inline for all build types.
diff --git a/src/cc/meta/Checkpoint.cc b/src/cc/meta/Checkpoint.cc
index c28c811ea..60331d006 100644
--- a/src/cc/meta/Checkpoint.cc
+++ b/src/cc/meta/Checkpoint.cc
@@ -41,6 +41,7 @@
 #include "MetaVrSM.h"
 #include "MetaVrLogSeq.h"
 #include "util.h"
+#include "NamespaceV2.h"
 
 #include "common/MdStream.h"
 #include "common/FdWriter.h"
@@ -169,6 +170,9 @@ Checkpoint::write(
         if (status == 0 && os) {
             status = gNetDispatch.CheckpointCryptoKeys(os);
         }
+        if (status == 0 && os && NamespaceV2::GetConfig().enabledFlag) {
+            status = NamespaceV2::GetStore().SaveCheckpointDiskEntry(os);
+        }
         if (status == 0) {
             os << "worm/" << (getWORMMode() ? 1 : 0) << '\n';
             os << "time/" << DisplayIsoDateTime() << '\n';
diff --git a/src/cc/meta/ChunkServer.cc b/src/cc/meta/ChunkServer.cc
index 7e701eed4..8c13aa51e 100644
--- a/src/cc/meta/ChunkServer.cc
+++ b/src/cc/meta/ChunkServer.cc
@@ -290,6 +290,7 @@ int ChunkServer::sMakeStableTimeout        = 330;
 int ChunkServer::sReplicationTimeout       = 510;
 int ChunkServer::sRequestTimeout           = 600;
 int ChunkServer::sMetaClientPort           = 0;
+bool ChunkServer::sSkipChunkAllocateInFlightLogFlag = false;
 int ChunkServer::sTimedoutExpireTime       = 10;
 size_t ChunkServer::sMaxChunksToEvacuate  = 2 << 10; // Max queue size
 // sHeartbeatInterval * sSrvLoadSamplerSampleCount -- boxcar FIR filter
@@ -399,6 +400,9 @@ void ChunkServer::SetParameters(const Properties& prop, int clientPort)
     sMaxPendingOpsCount = max(8, prop.getValue(
         "metaServer.chunkServer.maxPendingOpsCount",
         sMaxPendingOpsCount));
+    sSkipChunkAllocateInFlightLogFlag = prop.getValue(
+        "metaServer.chunkServer.skipChunkAllocateInFlightLog",
+        sSkipChunkAllocateInFlightLogFlag ? 1 : 0) != 0;
     if (clientPort > 0) {
         sMetaClientPort = clientPort;
     }
@@ -2203,6 +2207,30 @@ ChunkServer::HandleReply(IOBuffer* iobuf, int msgLen)
         op->status = -KfsToSysErrno(-op->status);
     }
     op->handleReply(prop);
+    if (op->op == META_CHUNK_ALLOCATE) {
+        const int64_t nowUsec = microseconds();
+        const int64_t submitUsec = op->submitTime;
+        const int64_t elapsedUsec = submitUsec > 0 ?
+            nowUsec - submitUsec : 0;
+        const MetaChunkAllocate* const allocOp =
+            static_cast<const MetaChunkAllocate*>(op);
+        const int64_t allocWaitUsec = allocOp->req &&
+                allocOp->req->debugAfterLayoutUsec > 0 ?
+            nowUsec - allocOp->req->debugAfterLayoutUsec : 0;
+        if (100000 <= allocWaitUsec || 100000 <= elapsedUsec) {
+            KFS_LOG_STREAM_INFO << GetServerLocation() <<
+                " meta-chunk-allocate reply timing:"
+                " seq: " << op->opSeqno <<
+                " chunk: " << op->chunkId <<
+                " status: " << op->status <<
+                " submit-to-reply-usec: " << elapsedUsec <<
+                " layout-to-reply-usec: " << allocWaitUsec <<
+                " process-usec: " << op->processTime <<
+                " msg-len: " << msgLen <<
+                " recursion: " << mRecursionCount <<
+            KFS_LOG_EOM;
+        }
+    }
     KFS_LOG_STREAM_DEBUG << GetServerLocation() <<
         " cs-reply:"
         " -seq: "   << op->opSeqno <<
@@ -2663,7 +2691,9 @@ ChunkServer::Enqueue(MetaChunkRequest& req,
             req.inFlightIt = sChunkOpsInFlight.insert(
                 make_pair(chunkIdInFlight, &req));
         }
-        if (! req.replayFlag) {
+        if (! req.replayFlag &&
+                (! sSkipChunkAllocateInFlightLogFlag ||
+                    META_CHUNK_ALLOCATE != req.op)) {
             mLogInFlightCount++;
             if (MetaChunkLogInFlight::Log(req, timeout, removeReplicaFlag)) {
                 return;
diff --git a/src/cc/meta/ChunkServer.h b/src/cc/meta/ChunkServer.h
index 9abd51380..a5dd5d80c 100644
--- a/src/cc/meta/ChunkServer.h
+++ b/src/cc/meta/ChunkServer.h
@@ -1060,6 +1060,7 @@ class ChunkServer :
     static int    sReplicationTimeout;
     static int    sRequestTimeout;
     static int    sMetaClientPort;
+    static bool   sSkipChunkAllocateInFlightLogFlag;
     static bool   sRestartCSOnInvalidClusterKeyFlag;
     static int    sSrvLoadSamplerSampleCount;
     static size_t sMaxChunksToEvacuate;
diff --git a/src/cc/meta/ClientManager.h b/src/cc/meta/ClientManager.h
index 54fbd390c..1d8043dc1 100644
--- a/src/cc/meta/ClientManager.h
+++ b/src/cc/meta/ClientManager.h
@@ -71,6 +71,10 @@ class ClientManager
         }
         return EnqueueSelf(thread, op);
     }
+    void EnqueueBatch(
+        ClientThread*         thread,
+        MetaRequest* const*   reqs,
+        size_t                count);
     static void SubmitRequest(ClientThread* thread, MetaRequest& op)
     {
         if (thread) {
diff --git a/src/cc/meta/ClientSM.h b/src/cc/meta/ClientSM.h
index ce040ecf6..a4eba9356 100644
--- a/src/cc/meta/ClientSM.h
+++ b/src/cc/meta/ClientSM.h
@@ -105,6 +105,8 @@ class ClientSM :
     bool Handle(MetaAllocate& op);
     int& GetLogQueueCounter()
         { return mLogQueueCounter; }
+    ClientManager::ClientThread* GetClientThread() const
+        { return mClientThread; }
 private:
     /// A handle to a network connection
     NetConnectionPtr                   mNetConnection;
diff --git a/src/cc/meta/LayoutManager.cc b/src/cc/meta/LayoutManager.cc
index 130746c14..aee36b7bd 100644
--- a/src/cc/meta/LayoutManager.cc
+++ b/src/cc/meta/LayoutManager.cc
@@ -32,6 +32,7 @@
 #include "ClientSM.h"
 #include "NetDispatch.h"
 #include "LogWriter.h"
+#include "NamespaceV2.h"
 
 #include "qcdio/QCIoBufferPool.h"
 #include "qcdio/QCUtils.h"
@@ -2059,6 +2060,7 @@ LayoutManager::LayoutManager()
       mConcurrentWritesPerNodeWatermark(10),
       mMaxSpaceUtilizationThreshold(0.95),
       mUseFsTotalSpaceFlag(true),
+      mHdfsLikeAllocateFlag(false),
       mChunkAllocMinAvailSpace(2 * (int64_t)CHUNKSIZE),
       mCompleteReplicationCheckInterval(30 * kSecs2MicroSecs),
       mCompleteReplicationCheckTime(
@@ -2392,6 +2394,9 @@ LayoutManager::SetParameters(const Properties& props, int clientPort)
     mUseFsTotalSpaceFlag = props.getValue(
         "metaServer.useFsTotalSpace",
         mUseFsTotalSpaceFlag ? 1 : 0) != 0;
+    mHdfsLikeAllocateFlag = props.getValue(
+        "metaServer.writeFlow.hdfsLikeAllocate",
+        mHdfsLikeAllocateFlag ? 1 : 0) != 0;
     mChunkAllocMinAvailSpace = props.getValue(
         "metaServer.chunkAllocMinAvailSpace",
         mChunkAllocMinAvailSpace);
@@ -2483,6 +2488,7 @@ LayoutManager::SetParameters(const Properties& props, int clientPort)
         mChunkReplicator.GetTimeoutInterval() * 1e-3) * 1e3));
 
     mCheckpoint.GetOp().SetParameters(props);
+    NamespaceV2::SetParameters(props);
 
     mCSCountersUpdateInterval = props.getValue(
         "metaServer.CSCountersUpdateInterval",
@@ -5034,6 +5040,17 @@ LayoutManager::AddNotStableChunk(
         return "chunk was open for append";
     }
     const seq_t curChunkVersion = pinfo.GetChunkInfo()->chunkVersion;
+    if (mHdfsLikeAllocateFlag && ! appendFlag &&
+            chunkVersion == 0 && curChunkVersion > 0) {
+        KFS_LOG_STREAM_INFO << logPrefix <<
+            " not stable chunk:"
+            " <"       << fileId <<
+            ","        << chunkId << ">" <<
+            " remapping dirty version 0 to current version " <<
+            curChunkVersion <<
+        KFS_LOG_EOM;
+        chunkVersion = curChunkVersion;
+    }
     if (chunkVersion < curChunkVersion) {
         return "lower chunk version";
     }
@@ -7158,6 +7175,19 @@ LayoutManager::AllocateChunk(
     if (req.appendChunk) {
         mARAChunkCache.RequestNew(req);
     }
+    if (mHdfsLikeAllocateFlag && ! req.appendChunk &&
+            ! req.stripedFileFlag && 0 < req.numReplicas &&
+            0 <= req.chunkVersion) {
+        KFS_LOG_STREAM_DEBUG <<
+            "hdfs-like allocate: deferred chunk create"
+            " fid: "     << req.fid <<
+            " chunk: "   << req.chunkId <<
+            " version: " << req.chunkVersion <<
+            " replicas: " << req.servers.size() <<
+        KFS_LOG_EOM;
+        req.LayoutDone(0);
+        return 0;
+    }
     for (size_t i = req.servers.size(); i-- > 0; ) {
         req.servers[i]->AllocateChunk(req, i == 0 ? req.leaseId : -1, tiers[i]);
     }
@@ -10124,7 +10154,7 @@ LayoutManager::MakeChunkStableInit(
         KFS_LOG_EOM;
         return;
     }
-    KFS_LOG_STREAM_INFO << logPrefix <<
+    KFS_LOG_STREAM_DEBUG << logPrefix <<
         " <" << fid << "," << chunkId << ">"
         " name: "     << pathname <<
         " version: "  << chunkVersion <<
@@ -10486,7 +10516,7 @@ LayoutManager::LogMakeChunkStableDone(MetaLogMakeChunkStable& req)
     info.serverAddedFlag              = false;
     info.chunkSize                    = req.chunkSize;
     info.chunkChecksum                = req.chunkChecksum;
-    KFS_LOG_STREAM_INFO << logPrefix <<
+    KFS_LOG_STREAM_DEBUG << logPrefix <<
         " <" << req.fid << "," << req.chunkId  << ">"
         " starting MCS"
         " version: "  << req.chunkVersion  <<
@@ -10514,6 +10544,86 @@ LayoutManager::LogMakeChunkStableDone(MetaLogMakeChunkStable& req)
     ));
 }
 
+// Schedules a truncate that cuts the trailing chunks with no connected server
+// off a non-striped file. Returns true only when a MetaTruncate was submitted.
+bool
+LayoutManager::ScheduleTruncateToLastRecoverableChunk(
+    fid_t      fid,
+    chunkId_t  chunkId,
+    chunkOff_t chunkSize)
+{
+    if (! mHdfsLikeAllocateFlag || ! mPrimaryFlag || fid < 0) {
+        return false;
+    }
+    StTmp<vector<MetaChunkInfo*> > cinfoTmp(mChunkInfosTmp);
+    vector<MetaChunkInfo*>&        chunks = cinfoTmp.Get();
+    MetaFattr*                     fa     = 0;
+    const int status = metatree.getalloc(fid, fa, chunks, 0);
+    if (status != 0 || ! fa || KFS_FILE != fa->type || fa->IsSymLink() ||
+            fa->IsStriped() || fa->numReplicas <= 0 || chunks.empty()) {
+        return false;
+    }
+    bool       sawUnrecoverableSuffixFlag = false;
+    chunkOff_t truncateOffset             = -1;
+    chunkId_t  lastRecoverableChunkId     = -1;
+    StTmp<Servers> serversTmp(mServers3Tmp);
+    Servers&       servers = serversTmp.Get();
+    for (vector<MetaChunkInfo*>::const_reverse_iterator it = chunks.rbegin();
+            it != chunks.rend();
+            ++it) { // reverse getalloc() order; presumably highest offset first -- confirm
+        MetaChunkInfo* const ci    = *it;
+        CSMap::Entry* const  entry = mChunkToServerMap.Find(ci->chunkId);
+        bool recoverableFlag = false;
+        if (entry) {
+            servers.clear();
+            mChunkToServerMap.GetServers(*entry, servers);
+            for (Servers::const_iterator si = servers.begin();
+                    si != servers.end();
+                    ++si) {
+                if ((*si)->IsConnected()) { // NOTE(review): a briefly disconnected server counts as lost -- confirm
+                    recoverableFlag = true;
+                    break;
+                }
+            }
+        }
+        if (! recoverableFlag) {
+            sawUnrecoverableSuffixFlag = true;
+            continue;
+        }
+        if (! sawUnrecoverableSuffixFlag) {
+            return false; // the final chunk is recoverable: nothing to cut off
+        }
+        truncateOffset = ci->offset + (chunkOff_t)CHUNKSIZE;
+        if (ci->chunkId == chunkId && 0 <= chunkSize &&
+                chunkSize < (chunkOff_t)CHUNKSIZE) {
+            truncateOffset = ci->offset + chunkSize; // trigger chunk: use its reported size
+        }
+        lastRecoverableChunkId = ci->chunkId;
+        break;
+    }
+    // chunks is not empty, so at least one unrecoverable chunk was seen here.
+    if (truncateOffset < 0) {
+        truncateOffset = 0; // no recoverable chunk at all: truncate to empty
+    }
+    if (fa->nextChunkOffset() <= truncateOffset) {
+        return false;
+    }
+    MetaTruncate& op = *(new MetaTruncate());
+    op.fid           = fid;
+    op.offset        = truncateOffset;
+    op.setEofHintFlag = true;
+    KFS_LOG_STREAM_INFO <<
+        "scheduling hdfs-like recovery truncate:"
+        " fid: "        << fid <<
+        " offset: "     << truncateOffset <<
+        " next: "       << fa->nextChunkOffset() <<
+        " trigger: "    << chunkId <<
+        " recoverable: " << lastRecoverableChunkId <<
+    KFS_LOG_EOM;
+    submit_request(&op);
+    return true;
+}
+
 void
 LayoutManager::MakeChunkStableDone(const MetaChunkMakeStable& req)
 {
@@ -10704,7 +10814,7 @@ LayoutManager::MakeChunkStableDone(const MetaChunkMakeStable& req)
             CancelPendingMakeStable(fileId, req.chunkId);
         }
     }
-    KFS_LOG_STREAM_INFO << logPrefix <<
+    KFS_LOG_STREAM_DEBUG << logPrefix <<
         " <" << req.fid << "," << req.chunkId  << ">"
         " fid: "              << fileId <<
         " version: "          << req.chunkVersion  <<
@@ -10716,6 +10826,16 @@ LayoutManager::MakeChunkStableDone(const MetaChunkMakeStable& req)
         " down: "             << numDownServers <<
         " server(s)" <<
     KFS_LOG_EOM;
+    if (updateSizeFlag &&
+            numServers > 0 &&
+            fa->filesize < 0 &&
+            ! fa->IsStriped() &&
+            pinfo->GetChunkInfo()->offset +
+                (chunkOff_t)CHUNKSIZE < fa->nextChunkOffset() &&
+            ScheduleTruncateToLastRecoverableChunk(
+                fileId, req.chunkId, req.chunkSize)) {
+        return;
+    }
     if (! updateSizeFlag ||
             numServers <= 0 ||
             fa->filesize >= 0 ||
diff --git a/src/cc/meta/LayoutManager.h b/src/cc/meta/LayoutManager.h
index 2da315e69..e04e878ae 100644
--- a/src/cc/meta/LayoutManager.h
+++ b/src/cc/meta/LayoutManager.h
@@ -1110,6 +1110,10 @@ class LayoutManager : public ITimeout
     void BeginMakeChunkStableDone(const MetaBeginMakeChunkStable& req);
     void LogMakeChunkStableDone(MetaLogMakeChunkStable& req);
     void MakeChunkStableDone(const MetaChunkMakeStable& req);
+    bool ScheduleTruncateToLastRecoverableChunk(
+        fid_t      fid,
+        chunkId_t  chunkId,
+        chunkOff_t chunkSize);
     void Handle(MetaLogMakeChunkStableDone& req);
     void ReplayPendingMakeStable(
         chunkId_t  chunkId,
@@ -2369,6 +2373,7 @@ class LayoutManager : public ITimeout
 
     double        mMaxSpaceUtilizationThreshold;
     bool          mUseFsTotalSpaceFlag;
+    bool          mHdfsLikeAllocateFlag;
     int64_t       mChunkAllocMinAvailSpace;
 
     int64_t       mCompleteReplicationCheckInterval;
diff --git a/src/cc/meta/LogWriter.cc b/src/cc/meta/LogWriter.cc
index f3899374c..4f0a72c42 100644
--- a/src/cc/meta/LogWriter.cc
+++ b/src/cc/meta/LogWriter.cc
@@ -28,6 +28,7 @@
 #include "LogWriter.h"
 #include "LogTransmitter.h"
 #include "MetaRequest.h"
+#include "NetDispatch.h"
 #include "MetaDataStore.h"
 #include "MetaVrSM.h"
 #include "MetaVrLogSeq.h"
@@ -47,6 +48,7 @@
 #include "kfsio/NetManager.h"
 #include "kfsio/ITimeout.h"
 #include "kfsio/checksum.h"
+#include "kfsio/Base64.h"
 #include "kfsio/PrngIsaac64.h"
 #include "kfsio/NetErrorSimulator.h"
 #include "kfsio/NetManagerWatcher.h"
@@ -1322,6 +1324,236 @@ class LogWriter::Impl :
             mLogAvgUsecsNextTimeUsec += kLogAvgIntervalUsec;
         }
     }
+    void SubmitDoneRequest(
+        MetaRequest& inReq,
+        int64_t      inStartTime,
+        bool&        ioFirstItemFlag)
+    {
+        const int64_t theUsecsNow = ioFirstItemFlag ?
+            inStartTime : microseconds();
+        ioFirstItemFlag = false;
+        if (META_LOG_WRITER_CONTROL != inReq.op) {
+            if (0 == inReq.status) {
+                mLogTimeUsec += inStartTime - inReq.submitTime;
+                mLogTimeOpsCount++;
+            } else {
+                mLogErrorOpsCount++;
+            }
+        }
+        inReq.Submit(theUsecsNow);
+    }
+    void SubmitDoneBatch(
+        vector<MetaRequest*>& inBatch,
+        int64_t               inStartTime,
+        bool&                 ioFirstItemFlag)
+    {
+        if (inBatch.empty()) {
+            return;
+        }
+        const int64_t theUsecsNow = ioFirstItemFlag ?
+            inStartTime : microseconds();
+        ioFirstItemFlag = false;
+        for (vector<MetaRequest*>::iterator it = inBatch.begin();
+                it != inBatch.end();
+                ++it) {
+            MetaRequest& req = **it;
+            if (META_LOG_WRITER_CONTROL != req.op) {
+                if (0 == req.status) {
+                    mLogTimeUsec += inStartTime - req.submitTime;
+                    mLogTimeOpsCount++;
+                } else {
+                    mLogErrorOpsCount++;
+                }
+            }
+            (void)theUsecsNow;
+            if (req.commitPendingFlag) {
+                RequestCommitted(req, fileID.getseed());
+            }
+        }
+        gNetDispatch.DispatchBatch(
+            &inBatch.front(), inBatch.size());
+    }
+    // True when inReq.NeedsNamespaceV2CreateIds() holds and its logAction is
+    // either kLogAlways, or kLogIfOk combined with a zero status.
+    bool IsNamespaceV2CreateIdsLoggable(const MetaRequest& inReq) const
+    {
+        return inReq.NeedsNamespaceV2CreateIds() &&
+            (MetaRequest::kLogAlways == inReq.logAction ||
+            (MetaRequest::kLogIfOk == inReq.logAction && 0 == inReq.status));
+    }
+    // Reserves consecutive fid / txn ids for inReq plus the loggable requests
+    // chained right after it (at most mMaxBlockSize) and assigns them in order.
+    void ReserveNamespaceV2CreateIdsBatch(MetaRequest& inReq)
+    {
+        if (! IsNamespaceV2CreateIdsLoggable(inReq)) {
+            return;
+        }
+        size_t       theCount = 0;
+        MetaRequest* thePtr   = &inReq;
+        while (thePtr && theCount < (size_t)mMaxBlockSize &&
+                IsNamespaceV2CreateIdsLoggable(*thePtr)) {
+            ++theCount;
+            thePtr = thePtr->next;
+        }
+        fid_t    theFid   = -1;
+        uint64_t theTxnId = 0;
+        MetaRequest::ReserveNamespaceV2CreateIdsBatch(
+            theCount, theFid, theTxnId);
+        for (thePtr = &inReq; 0 < theCount; --theCount, ++theFid, ++theTxnId) {
+            if (! thePtr || ! thePtr->SetNamespaceV2CreateIds(theFid, theTxnId)) {
+                panic("namespace v2 create id batch reserve failed");
+            }
+            thePtr = thePtr->next;
+        }
+    }
+    // Applies each queued request with ApplyNamespaceV2Batch(), commits the
+    // [first, last] txn id range once, then completes and clears the batch.
+    void FlushNamespaceV2Batch(
+        vector<MetaRequest*>& inBatch,
+        int64_t               inStartTime,
+        bool&                 ioFirstItemFlag)
+    {
+        if (inBatch.empty()) {
+            return;
+        }
+        uint64_t theFirstTxnId = 0; // 0 doubles as "not set yet"
+        uint64_t theLastTxnId  = 0;
+        for (size_t i = 0; i < inBatch.size(); ++i) {
+            MetaRequest& theReq = *inBatch[i];
+            theReq.ApplyNamespaceV2Batch(false);
+            if (0 == theFirstTxnId) {
+                theFirstTxnId = theReq.GetNamespaceV2BatchTxnId();
+            }
+            theLastTxnId = theReq.GetNamespaceV2BatchTxnId();
+        }
+        MetaRequest::CommitNamespaceV2Batch(theFirstTxnId, theLastTxnId);
+        SubmitDoneBatch(inBatch, inStartTime, ioFirstItemFlag);
+        inBatch.clear();
+    }
+
+    enum { kNamespaceV2WalBatchMaxCount = 64 };
+
+    // Only create / mkdir requests with loggable namespace v2 create ids are
+    // written as one batch record; every other op keeps the per-record format.
+    bool IsNamespaceV2WalBatchable(const MetaRequest& inReq) const
+    {
+        const bool theOpOkFlag =
+            META_CREATE == inReq.op || META_MKDIR == inReq.op;
+        return IsNamespaceV2CreateIdsLoggable(inReq) && theOpOkFlag;
+    }
+
+    template<typename T> // T is converted through uint64_t, so it must be an integer of at most 8 bytes
+    static void AppendLe(std::string& out, T v) // appends the sizeof(T) bytes of v to out
+    {
+        for (size_t i = 0; i < sizeof(T); i++) {
+            out.push_back((char)((uint64_t)v >> (i * 8))); // byte i of v, least significant byte first
+        }
+    }
+
+    static void AppendBytes(std::string& out, const char* data, size_t len) // appends len raw bytes starting at data
+    {
+        out.append(data, len);
+    }
+
+    bool WriteNamespaceV2WalBatchRecord( // writes one "nv2batch" line describing every request in batch
+        ostream&                os,
+        const vector<MetaRequest*>& batch) const
+    {
+        if (batch.empty()) {
+            return true; // nothing to write counts as success
+        }
+        // Record format: nv2batch/c/<count>/b/<base64_payload>
+        // The payload is a run of fixed little-endian fields per request, base64 encoded.
+        std::string payload;
+        payload.reserve(batch.size() * 64); // 64 bytes per entry is only a capacity hint
+        for (vector<MetaRequest*>::const_iterator it = batch.begin();
+                it != batch.end();
+                ++it) {
+            const MetaRequest& req = **it;
+            if (META_CREATE == req.op) {
+                const MetaCreate& c = static_cast<const MetaCreate&>(req);
+                AppendLe<uint8_t>(payload, 1); // create file
+                AppendLe<int64_t>(payload, (int64_t)c.dir);
+                AppendLe<int64_t>(payload, (int64_t)c.fid);
+                AppendLe<uint64_t>(payload, c.namespaceV2TxnId);
+                AppendLe<uint32_t>(payload, (uint32_t)c.user);
+                AppendLe<uint32_t>(payload, (uint32_t)c.group);
+                AppendLe<uint16_t>(payload, (uint16_t)c.mode);
+                AppendLe<int16_t>(payload, c.numReplicas);
+                AppendLe<int64_t>(payload, c.mtime);
+                const uint16_t nlen = (uint16_t)std::min<size_t>( // NOTE(review): names over 65535 bytes are silently cut -- confirm an earlier check rules this out
+                    0xFFFFu, c.name.size());
+                AppendLe<uint16_t>(payload, nlen);
+                AppendBytes(payload, c.name.data(), nlen);
+            } else if (META_MKDIR == req.op) {
+                const MetaMkdir& m = static_cast<const MetaMkdir&>(req);
+                AppendLe<uint8_t>(payload, 2); // mkdir dir
+                AppendLe<int64_t>(payload, (int64_t)m.dir);
+                AppendLe<int64_t>(payload, (int64_t)m.fid);
+                AppendLe<uint64_t>(payload, m.namespaceV2TxnId);
+                AppendLe<uint32_t>(payload, (uint32_t)m.user);
+                AppendLe<uint32_t>(payload, (uint32_t)m.group);
+                AppendLe<uint16_t>(payload, (uint16_t)m.mode);
+                AppendLe<int16_t>(payload, (int16_t)0); // zero in the replica-count slot keeps both entry kinds the same layout
+                AppendLe<int64_t>(payload, m.mtime);
+                const uint16_t nlen = (uint16_t)std::min<size_t>( // same 16-bit length clamp as the create branch
+                    0xFFFFu, m.name.size());
+                AppendLe<uint16_t>(payload, nlen);
+                AppendBytes(payload, m.name.data(), nlen);
+            } else {
+                return false; // only create and mkdir can be encoded
+            }
+        }
+        StBufferT<char, 4096> b64Buf;
+        char* const bufPtr = b64Buf.Resize(
+            Base64::GetEncodedMaxBufSize((int)payload.size()));
+        const int b64Len = Base64::Encode(
+            payload.data(), (int)payload.size(), bufPtr, true);
+        if (b64Len <= 0) {
+            return false;
+        }
+        // Reject records that would not fit into mMaxBlockBytes; nothing has been written to os yet.
+        const size_t kOverhead = 64; // allowance for the "nv2batch/c/<count>/b/" prefix and newline
+        if ((size_t)b64Len + kOverhead > (size_t)std::max(0, mMaxBlockBytes)) {
+            return false; // NOTE(review): the visible caller panics on false, so an oversized batch aborts -- confirm intended
+        }
+        os << "nv2batch/c/" << batch.size() << "/b/" <<
+            std::string(bufPtr, b64Len) << "\n";
+        return bool(os); // false when the stream is in a failed state after the write
+    }
+
+    bool WriteNamespaceV2WalBatchContRecord( // writes a single "nv2batchc" line; returns the stream state
+        ostream& os) const
+    {
+        // Placeholder record to preserve per-op log sequence numbering; carries no payload of its own.
+        os << "nv2batchc\n";
+        return bool(os);
+    }
+    // Queues inReq into ioBatch while its txn id continues the pending batch.
+    // Otherwise the pending batch is flushed first, and inReq then either
+    // starts a new batch or is submitted on its own.
+    void SubmitDoneOrBatch(
+        MetaRequest&          inReq,
+        vector<MetaRequest*>& ioBatch,
+        uint64_t&             ioNextBatchTxnId,
+        int64_t               inStartTime,
+        bool&                 ioFirstItemFlag)
+    {
+        // Not batchable: flush whatever is pending before submitting inReq.
+        if (! inReq.CanBatchApplyNamespaceV2()) {
+            FlushNamespaceV2Batch(ioBatch, inStartTime, ioFirstItemFlag);
+            ioNextBatchTxnId = 0;
+            SubmitDoneRequest(inReq, inStartTime, ioFirstItemFlag);
+            return;
+        }
+        const uint64_t theTxnId = inReq.GetNamespaceV2BatchTxnId();
+        if (! ioBatch.empty() && theTxnId != ioNextBatchTxnId) {
+            // Txn id does not follow the batch: apply the pending batch first.
+            FlushNamespaceV2Batch(ioBatch, inStartTime, ioFirstItemFlag);
+        }
+        ioBatch.push_back(&inReq);
+        ioNextBatchTxnId = theTxnId + 1;
+    }
     virtual void Timeout()
     {
         mDebugHistoryCommittedRing.Process(
@@ -1354,6 +1586,9 @@ class LogWriter::Impl :
             mNetManager.Wakeup();
         }
         Queue theReplayQueue;
+        vector<MetaRequest*> theNamespaceV2Batch;
+        theNamespaceV2Batch.reserve(mMaxBlockSize);
+        uint64_t      theNextBatchTxnId = 0;
         MetaRequest*  thePtr;
         int64_t const theStartTime     = microseconds();
         bool          theFirstItemFlag = true;
@@ -1374,25 +1609,28 @@ class LogWriter::Impl :
                     0 == thePtr->status &&
                     MetaLogWriterControl::kWriteBlock ==
                         static_cast<MetaLogWriterControl*>(thePtr)->type) {
+                FlushNamespaceV2Batch(theNamespaceV2Batch, theStartTime,
+                    theFirstItemFlag);
+                theNextBatchTxnId = 0;
                 // Run after setting replay state.
                 theReplayQueue.PushBack(*thePtr);
             } else if (IsMetaLogWriteOrVrError(thePtr->status) ||
                     thePtr->replayBypassFlag ||
-                    ! mReplayerPtr->submit(*thePtr)) {
-                const int64_t theUsecsNow = theFirstItemFlag ?
-                    theStartTime : microseconds();
-                theFirstItemFlag = false;
-                if (META_LOG_WRITER_CONTROL != thePtr->op) {
-                    if (0 == thePtr->status) {
-                        mLogTimeUsec += theStartTime - theReq.submitTime;
-                        mLogTimeOpsCount++;
-                    } else {
-                        mLogErrorOpsCount++;
-                    }
+                    ! mReplayerPtr->isSubmitQueueEnabled()) {
+                SubmitDoneOrBatch(theReq, theNamespaceV2Batch,
+                    theNextBatchTxnId, theStartTime, theFirstItemFlag);
+            } else {
+                FlushNamespaceV2Batch(theNamespaceV2Batch, theStartTime,
+                    theFirstItemFlag);
+                theNextBatchTxnId = 0;
+                if (! mReplayerPtr->submit(*thePtr)) {
+                    SubmitDoneOrBatch(theReq, theNamespaceV2Batch,
+                        theNextBatchTxnId, theStartTime, theFirstItemFlag);
                 }
-                theReq.Submit(theUsecsNow);
             }
         }
+        FlushNamespaceV2Batch(theNamespaceV2Batch, theStartTime,
+            theFirstItemFlag);
         UpdateLogAvg(theStartTime);
         if (theSetReplayStateFlag) {
             thePtr = theReplayCommitHeadPtr;
@@ -1730,14 +1968,73 @@ class LogWriter::Impl :
                         theFailureInjectedFlag = true;
                         break;
                     }
-                    ++mLastLogSeq.mLogSeq;
-                    thePtr->logseq = mLastLogSeq;
-                    if (! thePtr->WriteLog(theStream, mOmitDefaultsFlag)) {
-                        panic("log writer: invalid request");
-                    }
-                    if (! theStream) {
-                        --mLastLogSeq.mLogSeq;
-                        LogError(*thePtr);
+                    if (IsNamespaceV2WalBatchable(*thePtr)) {
+                        vector<MetaRequest*> batch;
+                        batch.reserve(kNamespaceV2WalBatchMaxCount);
+                        for (MetaRequest* ptr = thePtr;
+                                ptr && batch.size() < kNamespaceV2WalBatchMaxCount &&
+                                    IsNamespaceV2WalBatchable(*ptr) &&
+                                    (size_t)(mLastLogSeq.mLogSeq - mNextLogSeq.mLogSeq +
+                                        batch.size() + 1) < (size_t)mMaxBlockSize;
+                                ptr = ptr->next) {
+                            batch.push_back(ptr);
+                        }
+                        if (batch.size() <= 1) {
+                            // Let the non-batch path handle it.
+                            batch.clear();
+                        }
+                        if (! batch.empty() && ! theStream) {
+                            batch.clear();
+                        }
+                        if (! batch.empty()) {
+                        // Reserve create ids for the whole contiguous batch.
+                        ReserveNamespaceV2CreateIdsBatch(*thePtr);
+                        // Assign a unique log sequence to each op as usual.
+                        for (vector<MetaRequest*>::iterator it = batch.begin();
+                                it != batch.end();
+                                ++it) {
+                            ++mLastLogSeq.mLogSeq;
+                            (*it)->logseq = mLastLogSeq;
+                            if (! (*it)->PrepareLog()) {
+                                panic("log writer: invalid request");
+                            }
+                        }
+                        if (! WriteNamespaceV2WalBatchRecord(theStream, batch)) {
+                            panic("log writer: invalid namespace v2 WAL batch record");
+                        }
+                        for (size_t i = 1; i < batch.size(); i++) {
+                            if (! WriteNamespaceV2WalBatchContRecord(theStream)) {
+                                panic("log writer: invalid namespace v2 WAL batch cont record");
+                            }
+                        }
+                        // The whole batch has been written: leave thePtr on its last
+                        // request so the enclosing loop continues right after it.
+                        thePtr = batch.back();
+                        } else {
+                            ++mLastLogSeq.mLogSeq;
+                            thePtr->logseq = mLastLogSeq;
+                            ReserveNamespaceV2CreateIdsBatch(*thePtr);
+                            if (! thePtr->PrepareLog() ||
+                                    ! thePtr->WriteLog(theStream, mOmitDefaultsFlag)) {
+                                panic("log writer: invalid request");
+                            }
+                            if (! theStream) {
+                                --mLastLogSeq.mLogSeq;
+                                LogError(*thePtr);
+                            }
+                        }
+                    } else {
+                        ++mLastLogSeq.mLogSeq;
+                        thePtr->logseq = mLastLogSeq;
+                        ReserveNamespaceV2CreateIdsBatch(*thePtr);
+                        if (! thePtr->PrepareLog() ||
+                                ! thePtr->WriteLog(theStream, mOmitDefaultsFlag)) {
+                            panic("log writer: invalid request");
+                        }
+                        if (! theStream) {
+                            --mLastLogSeq.mLogSeq;
+                            LogError(*thePtr);
+                        }
                     }
                 }
                 if (theEndBlockSeq <= mLastLogSeq.mLogSeq ||
diff --git a/src/cc/meta/MetaRequest.cc b/src/cc/meta/MetaRequest.cc
index a5e11851b..fbcd4fc0b 100644
--- a/src/cc/meta/MetaRequest.cc
+++ b/src/cc/meta/MetaRequest.cc
@@ -37,6 +37,7 @@
 #include "ClientSM.h"
 #include "Replay.h"
 #include "MetaVrOps.h"
+#include "NamespaceV2.h"
 
 #include "kfsio/Globals.h"
 #include "kfsio/checksum.h"
@@ -86,6 +87,191 @@ static bool    gWormMode = false;
 static string  gChunkmapDumpDir(".");
 static const char* const ftypes[] = { "empty", "file", "dir" };
 
+static bool
+IsNamespaceV2RpcEnabled()
+{   // Both the namespace v2 switch and its RPC switch have to be on.
+    const NamespaceV2::Config& theConfig = NamespaceV2::GetConfig();
+    return theConfig.enabledFlag ? theConfig.rpcEnabledFlag : false;
+}
+
+static bool
+UseNamespaceV2RpcPath(
+    const MetaRequest& req)
+{   // A request already flagged for the v2 log always takes the v2 path.
+    if (req.namespaceV2LogFlag) { return true; }
+    return IsNamespaceV2RpcEnabled() && ! req.replayFlag;
+}
+
+static NamespaceV2::NamespaceStore&
+GetNamespaceV2StoreLocked()
+{
+    return NamespaceV2::GetStore(); // plain forward; takes no lock itself
+}
+
+static FileType
+NamespaceV2FileType(
+    NamespaceV2::InodeType type)
+{   // Directories map to KFS_DIR; every other inode type maps to KFS_FILE.
+    return NamespaceV2::kInodeTypeDir != type ? KFS_FILE : KFS_DIR;
+}
+
+class NamespaceV2RpcFattr : public MFattr
+{
+public:
+    void Set(
+        const NamespaceV2::LookupResult& attr)
+    {
+        fid                 = attr.fid;
+        user                = attr.user;
+        group               = attr.group;
+        mode                = attr.mode;
+        mtime               = attr.mtime;
+        ctime               = attr.ctime;
+        atime               = attr.atime;
+        type                = NamespaceV2FileType(attr.type);
+        numReplicas         = (KFS_FILE == type && 0 < attr.numReplicas) ?
+            attr.numReplicas : 0;
+        subcount1           = KFS_DIR == type ? attr.fileCount : 0;
+        subcount2           = KFS_DIR == type ? attr.dirCount : 0;
+        // Fields below are set to fixed values, not taken from attr.
+        striperType         = KFS_STRIPED_FILE_TYPE_NONE;
+        numRecoveryStripes  = 0;
+        numStripes          = 0;
+        stripeSize          = 0;
+        filesize            = 0;
+        minSTier            = maxSTier = kKfsSTierMax;
+        fattrExtTypes       = kFileAttrExtTypeNone;
+        extAttributes.clear();
+    }
+};
+
+static void
+NamespaceV2SetFattr(
+    const NamespaceV2::LookupResult& attr,
+    MFattr&                         fattr)
+{   // Fill a scratch converter, then copy its MFattr part into fattr.
+    NamespaceV2RpcFattr theConverted;
+    theConverted.Set(attr);
+    fattr = static_cast<const MFattr&>(theConverted);
+}
+
+static int
+NamespaceV2ResolveAbsPathLocked(
+    NamespaceV2::NamespaceStore& store,
+    fid_t&                       dir,
+    string&                      name)
+{
+    // Only an absolute path given relative to root, with no trailing slash,
+    // is split here; anything else is handed back untouched with status 0.
+    const bool absPathFlag = ROOTFID == dir && ! name.empty() &&
+        '/' == name[0] && '/' != name[name.size() - 1];
+    if (! absPathFlag) {
+        return 0;
+    }
+    const size_t lastSlash = name.rfind('/');
+    const string leaf(name, lastSlash + 1);
+    if (leaf.empty()) {
+        return -EINVAL;
+    }
+    // Drop the run of slashes that separates the parent path from the leaf.
+    size_t parentLen = lastSlash;
+    for (; 0 < parentLen && '/' == name[parentLen - 1]; --parentLen) {}
+    if (0 < parentLen) {
+        NamespaceV2::LookupResult parent;
+        const int status = store.LookupPath(ROOTFID,
+            name.substr(0, parentLen), parent);
+        if (0 != status) {
+            return status;
+        }
+        if (NamespaceV2::kInodeTypeDir != parent.type) {
+            return -ENOTDIR;
+        }
+        dir = parent.fid;
+    }
+    name = leaf;
+    return 0;
+}
+
+static bool
+NamespaceV2HasCreateIds(
+    fid_t    fid,
+    uint64_t txnId)
+{   // Both ids assigned: non-negative fid and non-zero transaction id.
+    return ! (fid < 0 || 0 == txnId);
+}
+
+static bool
+NamespaceV2NeedsCreateIds(
+    fid_t    fid,
+    uint64_t txnId)
+{   // Neither id assigned: negative fid and zero transaction id.
+    return ! (0 <= fid || 0 != txnId);
+}
+
+static bool
+NamespaceV2ReserveCreateIds(
+    fid_t&    fid,
+    uint64_t& txnId)
+{
+    if (NamespaceV2HasCreateIds(fid, txnId)) {
+        return true; // both ids already present: keep them as they are
+    }
+    if (! NamespaceV2NeedsCreateIds(fid, txnId)) {
+        return false; // only one of the two ids is set: refuse
+    }
+    NamespaceV2::TxnId theTxnId = 0;
+    GetNamespaceV2StoreLocked().ReserveCreateIds(fid, theTxnId);
+    txnId = theTxnId;
+    return NamespaceV2HasCreateIds(fid, txnId);
+}
+
+static int
+NamespaceV2ApplyCreateEdit(
+    fid_t                  parentFid,
+    const string&          name,
+    NamespaceV2::InodeType type,
+    fid_t&                 fid,
+    uint64_t&              txnId,
+    kfsUid_t               user,
+    kfsGid_t               group,
+    kfsMode_t              mode,
+    int16_t                numReplicas,
+    int64_t                mtime,
+    bool                   commitFlag,
+    bool                   advanceSeedsFlag = true)
+{
+    // Without both ids already assigned the edit is rejected with -EINVAL.
+    return ! NamespaceV2HasCreateIds(fid, txnId) ? -EINVAL :
+        GetNamespaceV2StoreLocked().ApplyCreateTrusted(
+            parentFid, name, type, fid,
+            static_cast<NamespaceV2::TxnId>(txnId), user, group, mode,
+            numReplicas, mtime, commitFlag, advanceSeedsFlag);
+}
+
+void
+MetaRequest::ReserveNamespaceV2CreateIdsBatch(
+    size_t    count,
+    fid_t&    firstFid,
+    uint64_t& firstTxnId)
+{
+    NamespaceV2::NamespaceStore& theStore = GetNamespaceV2StoreLocked();
+    NamespaceV2::TxnId           theStartTxn = 0;
+    theStore.ReserveCreateIdsRange(count, firstFid, theStartTxn);
+    firstTxnId = theStartTxn; // hand the txn id back as uint64_t
+}
+
+void
+MetaRequest::CommitNamespaceV2Batch(
+    uint64_t firstTxnId,
+    uint64_t lastTxnId)
+{
+    if (0 == firstTxnId || 0 == lastTxnId) {
+        return; // either end of the range is zero: nothing is committed
+    }
+    GetNamespaceV2StoreLocked().CommitThroughRange(
+        NamespaceV2::TxnId(firstTxnId), NamespaceV2::TxnId(lastTxnId));
+}
+
 class StIdempotentRequestHandler
 {
 public:
@@ -701,6 +887,16 @@ MetaLookup::handle()
     }
     authType = kAuthenticationTypeUndef; // always reset if op gets here.
     SetEUserAndEGroup(*this);
+    if (UseNamespaceV2RpcPath(*this)) {
+        NamespaceV2::LookupResult attr;
+        NamespaceV2::NamespaceStore& store = GetNamespaceV2StoreLocked();
+        status = (dir == ROOTFID && name == "/") ?
+            store.GetAttr(ROOTFID, attr) : store.Lookup(dir, name, attr);
+        if (status == 0) {
+            NamespaceV2SetFattr(attr, fattr);
+        }
+        return;
+    }
     MetaFattr* fa = 0;
     if ((status = metatree.lookup(dir, name, euser, egroup, fa)) == 0) {
         FattrReply(fa, fattr);
@@ -720,6 +916,14 @@ MetaLookupPath::handle()
         return;
     }
     SetEUserAndEGroup(*this);
+    if (UseNamespaceV2RpcPath(*this)) {
+        NamespaceV2::LookupResult attr;
+        status = GetNamespaceV2StoreLocked().LookupPath(root, path, attr);
+        if (status == 0) {
+            NamespaceV2SetFattr(attr, fattr);
+        }
+        return;
+    }
     MetaFattr* fa = 0;
     if ((status = metatree.lookupPath(
             root, path, euser, egroup, fa)) == 0) {
@@ -875,6 +1079,57 @@ MetaIdempotentRequest::IsHandled()
 const string kInvalidChunksPath("/proc/invalid_chunks");
 const string kInvalidChunksPrefix(kInvalidChunksPath + "/");
 
+/* virtual */ bool
+MetaCreate::PrepareLog()
+{   // Ids are reserved only for requests flagged for the namespace v2 log.
+    return namespaceV2LogFlag ?
+        NamespaceV2ReserveCreateIds(fid, namespaceV2TxnId) : true;
+}
+
+/* virtual */ bool
+MetaCreate::NeedsNamespaceV2CreateIds() const
+{   // False for replayed, replay-bypass, failed, or non-v2 requests.
+    return ! (! namespaceV2LogFlag || replayFlag || replayBypassFlag ||
+        0 != status) && NamespaceV2NeedsCreateIds(fid, namespaceV2TxnId);
+}
+
+/* virtual */ bool
+MetaCreate::SetNamespaceV2CreateIds(
+    fid_t    inFid,
+    uint64_t inTxnId)
+{
+    if (NeedsNamespaceV2CreateIds() && 0 <= inFid && 0 != inTxnId) {
+        fid              = inFid;
+        namespaceV2TxnId = inTxnId;
+        return true; // accepted: ids were needed and both are valid
+    }
+    return false;
+}
+
+/* virtual */ bool
+MetaCreate::CanBatchApplyNamespaceV2() const
+{   // Not a replay, status 0, valid logseq, ids assigned, not applied yet.
+    return ! (! namespaceV2LogFlag || namespaceV2AppliedFlag ||
+        replayFlag || replayBypassFlag || 0 != status) &&
+        logseq.IsValid() && NamespaceV2HasCreateIds(fid, namespaceV2TxnId);
+}
+
+/* virtual */ uint64_t
+MetaCreate::GetNamespaceV2BatchTxnId() const
+{
+    return namespaceV2TxnId; // the constructor initializes this to 0
+}
+
+/* virtual */ void
+MetaCreate::ApplyNamespaceV2Batch(
+    bool commitFlag)
+{
+    status = NamespaceV2ApplyCreateEdit(
+        dir, name, NamespaceV2::kInodeTypeFile, fid, namespaceV2TxnId, user,
+        group, mode, numReplicas, mtime, commitFlag, false /* advance seeds */);
+    namespaceV2AppliedFlag = true; // set regardless of the resulting status
+}
+
 /* virtual */ bool
 MetaCreate::start()
 {
@@ -888,6 +1143,58 @@ MetaCreate::start()
     if (0 != status) {
         return false;
     }
+    if (IsNamespaceV2RpcEnabled()) {
+        const bool kDirFlag = false;
+        if (! CheckCreatePerms(*this, kDirFlag)) {
+            return false;
+        }
+        if (gWormMode && ! IsWormMutationAllowed(name)) {
+            statusMsg = "worm mode";
+            status    = -EPERM;
+            return false;
+        }
+        fid = -1;
+        const bool wasNotObjectStoreFileFlag = 0 < numReplicas;
+        if (striperType != KFS_STRIPED_FILE_TYPE_NONE &&
+                0 < numRecoveryStripes) {
+            numReplicas = min(numReplicas,
+                gLayoutManager.GetMaxReplicasPerRSFile());
+        } else {
+            numReplicas = min(numReplicas,
+                gLayoutManager.GetMaxReplicasPerFile());
+        }
+        if (0 == numReplicas && wasNotObjectStoreFileFlag &&
+                gLayoutManager.IsObjectStoreEnabled()) {
+            striperType        = KFS_STRIPED_FILE_TYPE_NONE;
+            numRecoveryStripes = 0;
+            numStripes         = 0;
+            stripeSize         = 0;
+            if (minSTier < kKfsSTierMax) {
+                maxSTier = minSTier;
+            }
+        }
+        if (maxSTier < minSTier || ! IsValidSTier(minSTier) ||
+                ! IsValidSTier(maxSTier)) {
+            status    = -EINVAL;
+            statusMsg = "invalid storage tier range";
+            return false;
+        }
+        if (minSTier < kKfsSTierMax && 0 == numReplicas &&
+                minSTier != maxSTier) {
+            status    = -EINVAL;
+            statusMsg = "storage tier range is not supported with object store files";
+            return false;
+        }
+        if (! gLayoutManager.Validate(*this)) {
+            if (0 <= status) {
+                status = -EINVAL;
+            }
+            return false;
+        }
+        mtime = microseconds();
+        namespaceV2LogFlag = true;
+        return true;
+    }
     const bool invalChunkFlag = dir == ROOTFID &&
         startsWith(name, kInvalidChunksPrefix);
     if (invalChunkFlag) {
@@ -1019,6 +1326,15 @@ MetaCreate::handle()
     if (IsHandled()) {
         return;
     }
+    if (namespaceV2AppliedFlag) {
+        return;
+    }
+    if (UseNamespaceV2RpcPath(*this)) {
+        status = NamespaceV2ApplyCreateEdit(dir, name,
+            NamespaceV2::kInodeTypeFile, fid, namespaceV2TxnId,
+            user, group, mode, numReplicas, mtime, true);
+        return;
+    }
     fid = 0;
     MetaFattr* fa = 0;
     bool const kToDumpsterFlag = true;
@@ -1057,6 +1373,57 @@ MetaCreate::handle()
     }
 }
 
+/* virtual */ bool
+MetaMkdir::PrepareLog()
+{   // Ids are reserved only for requests flagged for the namespace v2 log.
+    return namespaceV2LogFlag ?
+        NamespaceV2ReserveCreateIds(fid, namespaceV2TxnId) : true;
+}
+
+/* virtual */ bool
+MetaMkdir::NeedsNamespaceV2CreateIds() const
+{   // False for replayed, replay-bypass, failed, or non-v2 requests.
+    return ! (! namespaceV2LogFlag || replayFlag || replayBypassFlag ||
+        0 != status) && NamespaceV2NeedsCreateIds(fid, namespaceV2TxnId);
+}
+
+/* virtual */ bool
+MetaMkdir::SetNamespaceV2CreateIds(
+    fid_t    inFid,
+    uint64_t inTxnId)
+{
+    if (NeedsNamespaceV2CreateIds() && 0 <= inFid && 0 != inTxnId) {
+        fid              = inFid;
+        namespaceV2TxnId = inTxnId;
+        return true; // accepted: ids were needed and both are valid
+    }
+    return false;
+}
+
+/* virtual */ bool
+MetaMkdir::CanBatchApplyNamespaceV2() const
+{   // Not a replay, status 0, valid logseq, ids assigned, not applied yet.
+    return ! (! namespaceV2LogFlag || namespaceV2AppliedFlag ||
+        replayFlag || replayBypassFlag || 0 != status) &&
+        logseq.IsValid() && NamespaceV2HasCreateIds(fid, namespaceV2TxnId);
+}
+
+/* virtual */ uint64_t
+MetaMkdir::GetNamespaceV2BatchTxnId() const
+{
+    return namespaceV2TxnId; // the constructor initializes this to 0
+}
+
+/* virtual */ void
+MetaMkdir::ApplyNamespaceV2Batch(
+    bool commitFlag)
+{
+    status = NamespaceV2ApplyCreateEdit(
+        dir, name, NamespaceV2::kInodeTypeDir, fid, namespaceV2TxnId, user,
+        group, mode, 0 /* replicas */, mtime, commitFlag, false);
+    namespaceV2AppliedFlag = true; // set regardless of the resulting status
+}
+
 /* virtual */ bool
 MetaMkdir::start()
 {
@@ -1074,6 +1441,9 @@ MetaMkdir::start()
     if (! CheckCreatePerms(*this, kDirFlag)) {
         return false;
     }
+    if (IsNamespaceV2RpcEnabled()) {
+        namespaceV2LogFlag = true;
+    }
     if (0 == status) {
         mtime = microseconds();
     }
@@ -1086,6 +1456,15 @@ MetaMkdir::handle()
     if (IsHandled()) {
         return;
     }
+    if (namespaceV2AppliedFlag) {
+        return;
+    }
+    if (UseNamespaceV2RpcPath(*this)) {
+        status = NamespaceV2ApplyCreateEdit(dir, name,
+            NamespaceV2::kInodeTypeDir, fid, namespaceV2TxnId,
+            user, group, mode, 0, mtime, true);
+        return;
+    }
     fid = 0;
     MetaFattr* fa = 0;
     status = metatree.mkdir(
@@ -1150,6 +1529,9 @@ MetaRemove::start()
     if (0 == status) {
         mtime = microseconds();
     }
+    if (IsNamespaceV2RpcEnabled()) {
+        namespaceV2LogFlag = true;
+    }
     return (0 == status);
 }
 
@@ -1159,6 +1541,18 @@ MetaRemove::handle()
     if (IsHandled()) {
         return;
     }
+    if (UseNamespaceV2RpcPath(*this)) {
+        NamespaceV2::NamespaceStore& store = GetNamespaceV2StoreLocked();
+        status = NamespaceV2ResolveAbsPathLocked(store, dir, name);
+        if (status == 0) {
+            NamespaceV2::TxnId txnId = 0;
+            status = store.RemoveFile(dir, name, &txnId);
+            if (status == 0) {
+                store.CommitThrough(txnId);
+            }
+        }
+        return;
+    }
     if ((status = LookupAbsPath(dir, name, euser, egroup)) != 0) {
         return;
     }
@@ -1184,6 +1578,9 @@ MetaRmdir::start()
     if (0 == status) {
         mtime = microseconds();
     }
+    if (IsNamespaceV2RpcEnabled()) {
+        namespaceV2LogFlag = true;
+    }
     return (0 == status);
 }
 
@@ -1193,6 +1590,18 @@ MetaRmdir::handle()
     if (IsHandled()) {
         return;
     }
+    if (UseNamespaceV2RpcPath(*this)) {
+        NamespaceV2::NamespaceStore& store = GetNamespaceV2StoreLocked();
+        status = NamespaceV2ResolveAbsPathLocked(store, dir, name);
+        if (status == 0) {
+            NamespaceV2::TxnId txnId = 0;
+            status = store.Rmdir(dir, name, &txnId);
+            if (status == 0) {
+                store.CommitThrough(txnId);
+            }
+        }
+        return;
+    }
     if ((status = LookupAbsPath(dir, name, euser, egroup)) != 0) {
         return;
     }
@@ -1243,6 +1652,71 @@ MetaReaddir::handle()
     }
     numEntries = 0;
     resp.Clear();
+    if (UseNamespaceV2RpcPath(*this)) {
+        NamespaceV2::NamespaceStore& store = GetNamespaceV2StoreLocked();
+        NamespaceV2::LookupResult dirAttr;
+        status = store.GetAttr(dir, dirAttr);
+        if (status != 0) {
+            return;
+        }
+        if (dirAttr.type != NamespaceV2::kInodeTypeDir) {
+            status = -ENOTDIR;
+            return;
+        }
+        SetEUserAndEGroup(*this);
+        NamespaceV2RpcFattr dirFattr;
+        dirFattr.Set(dirAttr);
+        if (! dirFattr.CanRead(euser, egroup)) {
+            status = -EACCES;
+            return;
+        }
+        NamespaceV2::ReaddirResult result;
+        const size_t v2MaxEntries = 0 < maxEntries ? (size_t)maxEntries :
+            numeric_limits<size_t>::max();
+        status = fnameStart.empty() ? store.Readdir(dir, 0,
+            v2MaxEntries, result) : store.ReaddirFromName(dir, fnameStart,
+            v2MaxEntries, result);
+        if (status != 0) {
+            return;
+        }
+        hasMoreEntriesFlag = result.moreEntriesFlag;
+        if (oldFormatFlag && hasMoreEntriesFlag) {
+            status     = -ENOMEM;
+            statusMsg  = "response exceeds max. allowed number of entries"
+                " consider updating kfs client lib";
+            return;
+        }
+        const int extSize = IOBufferData::GetDefaultBufferSize() +
+            int(MAX_FILE_NAME_LENGTH);
+        int maxSize = gLayoutManager.GetMaxResponseSize();
+        if (! oldFormatFlag && extSize * 2 < maxSize) {
+            maxSize -= extSize;
+        }
+        IOBufferWriter writer(resp);
+        size_t i = 0;
+        for (; i < result.entries.size() && writer.GetSize() <= maxSize;
+                ++i) {
+            const string& entryName = result.entries[i].key.name;
+            if (dir == ROOTFID && entryName == "/") {
+                continue;
+            }
+            writer.Write(entryName);
+            writer.Write("\n", 1);
+            ++numEntries;
+        }
+        writer.Close();
+        if (resp.BytesConsumable() > maxSize) {
+            if (oldFormatFlag) {
+                resp.Clear();
+                numEntries = 0;
+                status     = -ENOMEM;
+                statusMsg  = "response exceeds max. size";
+            } else if (i < result.entries.size()) {
+                hasMoreEntriesFlag = true;
+            }
+        }
+        return;
+    }
     vector<MetaDentry*>& v = GetReadDirTmpVec();
     if ((status = fnameStart.empty() ?
             metatree.readdir(dir, v,
@@ -1806,6 +2280,91 @@ MetaReaddirPlus::handle()
             (maxEntries <= 0 || numEntries < maxEntries)) {
         maxEntries = numEntries;
     }
+    if (UseNamespaceV2RpcPath(*this)) {
+        NamespaceV2::NamespaceStore& store = GetNamespaceV2StoreLocked();
+        NamespaceV2::LookupResult dirAttr;
+        status = store.GetAttr(dir, dirAttr);
+        if (status != 0) {
+            return;
+        }
+        if (dirAttr.type != NamespaceV2::kInodeTypeDir) {
+            status = -ENOTDIR;
+            return;
+        }
+        SetEUserAndEGroup(*this);
+        NamespaceV2RpcFattr dirFattr;
+        dirFattr.Set(dirAttr);
+        if (! dirFattr.CanRead(euser, egroup)) {
+            status = -EACCES;
+            return;
+        }
+        noAttrsFlag = ! dirFattr.CanSearch(euser, egroup);
+        NamespaceV2::ReaddirResult result;
+        const size_t v2MaxEntries = 0 < maxEntries ? (size_t)maxEntries :
+            numeric_limits<size_t>::max();
+        status = fnameStart.empty() ? store.Readdir(dir, 0,
+            v2MaxEntries, result) : store.ReaddirFromName(dir, fnameStart,
+            v2MaxEntries, result);
+        if (status != 0) {
+            return;
+        }
+        hasMoreEntriesFlag = result.moreEntriesFlag;
+        if (numEntries < 0 && hasMoreEntriesFlag) {
+            status     = -ENOMEM;
+            statusMsg  = "response exceeds max. allowed number of entries"
+                " consider updating kfs client lib";
+            return;
+        }
+        maxRespSize = max(0, gLayoutManager.GetMaxResponseSize());
+        const int extSize = IOBufferData::GetDefaultBufferSize() +
+            int(MAX_FILE_NAME_LENGTH);
+        const size_t maxSize = (size_t)((numEntries >= 0 &&
+            extSize * 2 < maxRespSize) ? maxRespSize - extSize :
+            maxRespSize);
+        dentries.reserve(result.entries.size() + (fnameStart.empty() ? 2 : 0));
+        omitLastChunkInfoFlag = true;
+        size_t responseSize = 0;
+        if (fnameStart.empty()) {
+            dentries.push_back(DEntry(dirFattr, "."));
+            dentries.push_back(DEntry(dirFattr, ".."));
+            responseSize += 2 * 148 + 3;
+        }
+        size_t i = 0;
+        for (; i < result.entries.size() && responseSize <= maxSize; ++i) {
+            const NamespaceV2::ReaddirResult::Entry& entry =
+                result.entries[i];
+            NamespaceV2::LookupResult attr;
+            if (store.GetAttr(entry.childFid, attr) != 0) {
+                continue;
+            }
+            NamespaceV2RpcFattr fa;
+            fa.Set(attr);
+            const string& entryName = entry.key.name;
+            if (fa.id() == ROOTFID && entryName == "/") {
+                continue;
+            }
+            responseSize += entryName.length() +
+                (fa.type == KFS_DIR ? 148 : 272);
+            dentries.push_back(DEntry(fa, entryName));
+        }
+        if (maxSize < responseSize) {
+            if (numEntries < 0) {
+                status    = -ENOMEM;
+                statusMsg = "response exceeds max. size";
+                dentries.clear();
+                responseSize = 0;
+            } else if (i < result.entries.size()) {
+                hasMoreEntriesFlag = true;
+            }
+        }
+        ioBufPending = (int64_t)responseSize;
+        if (ioBufPending > 0) {
+            gLayoutManager.ChangeIoBufPending(ioBufPending);
+            maxRespSize = (int)max((int64_t)maxRespSize, ioBufPending +
+                IOBufferData::GetDefaultBufferSize());
+        }
+        return;
+    }
     vector<MetaDentry*>& res = GetReadDirTmpVec();
     if ((status = fnameStart.empty() ?
             metatree.readdir(dir, res,
@@ -2011,6 +2570,13 @@ MetaGetalloc::handle()
         return;
     }
     if (err) {
+        if (gLayoutManager.ScheduleTruncateToLastRecoverableChunk(
+                fid, chunkId, chunkOff_t(-1))) {
+            status    = -EAGAIN;
+            statusMsg = "truncating unrecoverable tail chunk: ";
+            AppendDecIntToString(statusMsg, chunkId);
+            return;
+        }
         status    = -EAGAIN;
         statusMsg = "no replicas available chunk: ";
         AppendDecIntToString(statusMsg, chunkId);
@@ -2111,6 +2677,13 @@ MetaGetlayout::handle()
             assert(! fa || cfa == fa);
             if (err && ! continueIfNoReplicasFlag) {
                 resp.Clear();
+                if (gLayoutManager.ScheduleTruncateToLastRecoverableChunk(
+                        fid, l.chunkId, chunkOff_t(-1))) {
+                    status    = -EAGAIN;
+                    statusMsg = "truncating unrecoverable tail chunk: ";
+                    AppendDecIntToString(statusMsg, l.chunkId);
+                    break;
+                }
                 status    = -EHOSTUNREACH;
                 statusMsg = "no replicas available chunk: ";
                 AppendDecIntToString(statusMsg, l.chunkId);
@@ -2182,6 +2755,9 @@ MetaAllocate::dispatch(ClientSM& sm)
 MetaAllocate::handle()
 {
     assert(! MetaRequest::next);
+    if (debugStartUsec <= 0) {
+        debugStartUsec = microseconds();
+    }
     suspended = false;
     if (startedFlag) {
         return;
@@ -2330,8 +2906,10 @@ MetaAllocate::handle()
         }
         return;
     }
+    debugBeforeLayoutUsec = microseconds();
     suspended = true;
     const int ret = gLayoutManager.AllocateChunk(*this, chunkBlock);
+    debugAfterLayoutUsec = microseconds();
     if (0 == ret) {
         return;
     }
@@ -2343,6 +2921,7 @@ MetaAllocate::handle()
 void
 MetaAllocate::LayoutDone(int64_t chunkAllocProcessTime)
 {
+    debugLayoutDoneUsec = microseconds();
     suspended = false;
     if (0 == status) {
         // Check if all servers are still up, and didn't go down
@@ -2385,8 +2964,12 @@ MetaAllocate::LayoutDone(int64_t chunkAllocProcessTime)
     }
     if (0 == status) {
         assert(! MetaRequest::next);
+        debugLogStartUsec = microseconds();
         suspended = true;
         submit_request(new MetaLogChunkAllocate(this));
+        // This log request is submitted while processing another request, so it
+        // can bypass the client thread's end-of-batch log flush trigger.
+        GetLogWriter().ScheduleFlush();
         return;
     }
     const bool kCountAllocTimeFlag = true;
@@ -2609,6 +3192,30 @@ MetaAllocate::Done(bool countAllocTimeFlag, int64_t chunkAllocProcessTime)
         processTime += microseconds() - chunkAllocProcessTime;
     }
     if (! next) {
+        const int64_t now = microseconds();
+        const int64_t totalUsec = debugStartUsec > 0 ? now - debugStartUsec : 0;
+        if (100000 <= totalUsec) {
+            KFS_LOG_STREAM_INFO <<
+                "allocate timing:"
+                " seq: " << opSeqno <<
+                " fid: " << fid <<
+                " chunk: " << chunkId <<
+                " status: " << status <<
+                " total-usec: " << totalUsec <<
+                " pre-layout-usec: " <<
+                    (debugBeforeLayoutUsec > debugStartUsec ?
+                        debugBeforeLayoutUsec - debugStartUsec : 0) <<
+                " layout-call-usec: " <<
+                    (debugAfterLayoutUsec > debugBeforeLayoutUsec ?
+                        debugAfterLayoutUsec - debugBeforeLayoutUsec : 0) <<
+                " wait-chunk-usec: " <<
+                    (debugLayoutDoneUsec > debugAfterLayoutUsec ?
+                        debugLayoutDoneUsec - debugAfterLayoutUsec : 0) <<
+                " log-wait-usec: " <<
+                    (debugLogStartUsec > 0 ? now - debugLogStartUsec : 0) <<
+                " servers: " << servers.size() <<
+            KFS_LOG_EOM;
+        }
         submit_request(this);
         return;
     }
@@ -2938,6 +3545,9 @@ MetaRename::start()
     }
     if (0 == status) {
         mtime = microseconds();
+        if (IsNamespaceV2RpcEnabled()) {
+            namespaceV2LogFlag = true;
+        }
     }
     return (0 == status);
 }
@@ -2952,10 +3562,19 @@ MetaRename::handle()
         // renames are disabled in WORM mode: otherwise, we
         // ocould overwrite an existing file
         srcFid = -1;
-        bool const kToDumpsterFlag = true;
-        status = metatree.rename(dir, oldname, newname,
-            oldpath, overwrite && ! wormModeFlag, euser, egroup,
-            mtime, &srcFid, kToDumpsterFlag);
+        if (UseNamespaceV2RpcPath(*this)) {
+            NamespaceV2::TxnId txnId = 0;
+            status = GetNamespaceV2StoreLocked().Rename(dir, oldname,
+                newname, overwrite && ! wormModeFlag, &txnId, &srcFid);
+            if (status == 0 && txnId != 0) {
+                GetNamespaceV2StoreLocked().CommitThrough(txnId);
+            }
+        } else {
+            bool const kToDumpsterFlag = true;
+            status = metatree.rename(dir, oldname, newname,
+                oldpath, overwrite && ! wormModeFlag, euser, egroup,
+                mtime, &srcFid, kToDumpsterFlag);
+        }
         if (wormModeFlag && -EEXIST == status) {
             statusMsg = "worm mode";
             status    = -EPERM;
@@ -5050,6 +5669,9 @@ MetaAllocate::responseSelf(ReqOstream& os)
         (shortRpcFormatFlag ? "H:" : "Chunk-handle: ")  << chunkId << "\r\n" <<
         (shortRpcFormatFlag ? "V:" : "Chunk-version: ") << (0 == numReplicas ?
             -chunkVersion - 1 : chunkVersion) << "\r\n";
+    if (0 <= leaseId) {
+        os << (shortRpcFormatFlag ? "L:" : "Lease-id: ") << leaseId << "\r\n";
+    }
     if (appendChunk) {
         os << (shortRpcFormatFlag ? "O:" : "Chunk-offset: ") <<
             offset << "\r\n";
diff --git a/src/cc/meta/MetaRequest.h b/src/cc/meta/MetaRequest.h
index f61cc5bcf..8f12ac3c1 100644
--- a/src/cc/meta/MetaRequest.h
+++ b/src/cc/meta/MetaRequest.h
@@ -283,6 +283,7 @@ struct MetaRequest {
     bool            replayFlag;
     bool            commitPendingFlag;
     bool            replayBypassFlag;
+    bool            namespaceV2LogFlag;
     string          clientIp;
     string          clientReportedIp;
     string          nodeId;
@@ -317,6 +318,7 @@ struct MetaRequest {
           replayFlag(false),
           commitPendingFlag(false),
           replayBypassFlag(false),
+          namespaceV2LogFlag(false),
           clientIp(),
           clientReportedIp(),
           nodeId(),
@@ -344,6 +346,16 @@ struct MetaRequest {
     //!< response to be sent back as per the KFS protocol.
     virtual void response(ReqOstream& os, IOBuffer& /* buf */) { response(os); }
     virtual bool log(ostream& file) const;
+    virtual bool PrepareLog() { return true; }
+    virtual bool NeedsNamespaceV2CreateIds() const { return false; }
+    virtual bool SetNamespaceV2CreateIds(
+        fid_t /* fid */, uint64_t /* txnId */) { return false; }
+    static void ReserveNamespaceV2CreateIdsBatch(
+        size_t count, fid_t& firstFid, uint64_t& firstTxnId);
+    virtual bool CanBatchApplyNamespaceV2() const { return false; }
+    virtual uint64_t GetNamespaceV2BatchTxnId() const { return 0; }
+    virtual void ApplyNamespaceV2Batch(bool /* commitFlag */) {}
+    static void CommitNamespaceV2Batch(uint64_t firstTxnId, uint64_t lastTxnId);
     Display Show() const { return Display(*this); }
     virtual void setChunkServer(const ChunkServerPtr& /* cs */) {};
     bool ValidateRequestHeader(
@@ -395,6 +407,7 @@ struct MetaRequest {
         .Def("u", &MetaRequest::euser,               kKfsUserNone)
         .Def("g", &MetaRequest::egroup,              kKfsGroupNone)
         .Def("a", &MetaRequest::authUid,             kKfsUserNone)
+        .Def("V2", &MetaRequest::namespaceV2LogFlag, false)
         .Def("z", &MetaRequest::logseq)
         .Def("x", &MetaRequest::shortRpcFormatFlag,  true)
         ;
@@ -468,6 +481,7 @@ struct MetaRequest {
         replayFlag          = false;
         commitPendingFlag   = false;
         replayBypassFlag    = false;
+        namespaceV2LogFlag = false;
         clientIp = string();
         nodeId = string();
         reqHeaders.Clear();
@@ -695,6 +709,8 @@ struct MetaCreate: public MetaIdempotentRequest {
     string     name;                //!< name to create
     string     ownerName;
     string     groupName;
+    uint64_t   namespaceV2TxnId;
+    bool       namespaceV2AppliedFlag;
     int64_t    mtime;
     MetaCreate()
         : MetaIdempotentRequest(META_CREATE, kLogIfOk),
@@ -714,10 +730,18 @@ struct MetaCreate: public MetaIdempotentRequest {
           name(),
           ownerName(),
           groupName(),
+          namespaceV2TxnId(0),
+          namespaceV2AppliedFlag(false),
           mtime()
         {}
     virtual bool start();
     virtual void handle();
+    virtual bool PrepareLog();
+    virtual bool NeedsNamespaceV2CreateIds() const;
+    virtual bool SetNamespaceV2CreateIds(fid_t fid, uint64_t txnId);
+    virtual bool CanBatchApplyNamespaceV2() const;
+    virtual uint64_t GetNamespaceV2BatchTxnId() const;
+    virtual void ApplyNamespaceV2Batch(bool commitFlag);
     virtual void response(ReqOstream &os);
     virtual ostream& ShowSelf(ostream& os) const
     {
@@ -784,6 +808,8 @@ struct MetaCreate: public MetaIdempotentRequest {
         .Def("SS", &MetaCreate::stripeSize,         int32_t(0))
         .Def("E",  &MetaCreate::exclusive,          false)
         .Def("N",  &MetaCreate::name)
+        .Def("H",  &MetaCreate::fid,                fid_t(-1))
+        .Def("VT", &MetaCreate::namespaceV2TxnId,   uint64_t(0))
         .Def("O",  &MetaCreate::user,               kKfsUserNone)
         .Def("G",  &MetaCreate::group,              kKfsGroupNone)
         .Def("M",  &MetaCreate::mode,               kKfsModeUndef)
@@ -809,6 +835,8 @@ struct MetaMkdir: public MetaIdempotentRequest {
     string     ownerName;
     string     groupName;
     int64_t    mtime;
+    uint64_t   namespaceV2TxnId;
+    bool       namespaceV2AppliedFlag;
     MetaMkdir()
         : MetaIdempotentRequest(META_MKDIR, kLogIfOk),
           dir(-1),
@@ -821,10 +849,18 @@ struct MetaMkdir: public MetaIdempotentRequest {
           name(),
           ownerName(),
           groupName(),
-          mtime()
+          mtime(),
+          namespaceV2TxnId(0),
+          namespaceV2AppliedFlag(false)
         {}
     virtual bool start();
     virtual void handle();
+    virtual bool PrepareLog();
+    virtual bool NeedsNamespaceV2CreateIds() const;
+    virtual bool SetNamespaceV2CreateIds(fid_t fid, uint64_t txnId);
+    virtual bool CanBatchApplyNamespaceV2() const;
+    virtual uint64_t GetNamespaceV2BatchTxnId() const;
+    virtual void ApplyNamespaceV2Batch(bool commitFlag);
     virtual void response(ReqOstream &os);
     virtual ostream& ShowSelf(ostream& os) const
     {
@@ -875,6 +911,8 @@ struct MetaMkdir: public MetaIdempotentRequest {
         return MetaIdempotentRequest::LogIoDef(parser)
         .Def("P",  &MetaMkdir::dir,         fid_t(-1))
         .Def("N",  &MetaMkdir::name                  )
+        .Def("H",  &MetaMkdir::fid,         fid_t(-1))
+        .Def("VT", &MetaMkdir::namespaceV2TxnId, uint64_t(0))
         .Def("U",  &MetaMkdir::user,     kKfsUserNone)
         .Def("G",  &MetaMkdir::group,   kKfsGroupNone)
         .Def("M",  &MetaMkdir::mode,    kKfsModeUndef)
@@ -1358,6 +1396,11 @@ struct MetaAllocate: public MetaRequest, public  KfsCallbackObj {
     bool                 allChunkServersShortRpcFlag;
     bool                 logChunkVersionChangeFailedFlag;
     bool                 stoppedServicingFlag;
+    int64_t              debugStartUsec;
+    int64_t              debugBeforeLayoutUsec;
+    int64_t              debugAfterLayoutUsec;
+    int64_t              debugLayoutDoneUsec;
+    int64_t              debugLogStartUsec;
     TokenSeq             tokenSeq;
     time_t               issuedTime;
     int                  validForTime;
@@ -1404,6 +1447,11 @@ struct MetaAllocate: public MetaRequest, public  KfsCallbackObj {
           allChunkServersShortRpcFlag(false),
           logChunkVersionChangeFailedFlag(false),
           stoppedServicingFlag(false),
+          debugStartUsec(0),
+          debugBeforeLayoutUsec(0),
+          debugAfterLayoutUsec(0),
+          debugLayoutDoneUsec(0),
+          debugLogStartUsec(0),
           tokenSeq(),
           issuedTime(),
           validForTime(0),
diff --git a/src/cc/meta/NamespaceV2.cc b/src/cc/meta/NamespaceV2.cc
new file mode 100644
index 000000000..eac86c228
--- /dev/null
+++ b/src/cc/meta/NamespaceV2.cc
@@ -0,0 +1,2969 @@
+//---------------------------------------------------------- -*- Mode: C++ -*-
+// $Id$
+//
+// Memory-native namespace scaffolding for RFC-0001.
+//
+// Copyright 2026 Quantcast Corporation. All rights reserved.
+//
+// This file is part of Kosmos File System (KFS).
+//
+// Licensed under the Apache License, Version 2.0.
+//
+//----------------------------------------------------------------------------
+
+#include "NamespaceV2.h"
+
+#include "common/Properties.h"
+#include "common/hsieh_hash.h"
+#include "common/time.h"
+#include "qcdio/QCMutex.h"
+
+#include <algorithm>
+#include <errno.h>
+#include <functional>
+#include <utility>
+#include <limits>
+#include <istream>
+#include <ostream>
+#include <sstream>
+
+namespace KFS
+{
+namespace NamespaceV2
+{
+
+namespace
+{
+    const TxnId kNoTxn = 0;
+    Config      sConfig;
+    enum { kNamespaceV2ShardCount = 1024 };
+
+    // Shard slot for fid: multiplicative mix, then modulo the shard count.
+    size_t
+    GetLockShard(fid_t fid)
+    {
+        const uint64_t mixed = (uint64_t)fid * 11400714819323198485ULL;
+        return (size_t)mixed % kNamespaceV2ShardCount;
+    }
+
+    // The single transaction mutex; constructed on the first call.
+    QCMutex& GetTxnMutex()
+    {
+        static QCMutex sTxnMutex;
+        return sTxnMutex;
+    }
+
+    // Base of the array of kNamespaceV2ShardCount directory shard mutexes.
+    QCMutex* GetDirShardMutexes()
+    {
+        static QCMutex sDirLocks[kNamespaceV2ShardCount];
+        return &sDirLocks[0];
+    }
+
+    // Base of the array of kNamespaceV2ShardCount inode shard mutexes.
+    QCMutex* GetInodeShardMutexes()
+    {
+        static QCMutex sInodeLocks[kNamespaceV2ShardCount];
+        return &sInodeLocks[0];
+    }
+
+    // Directory shard mutex at the lock shard index computed from fid.
+    QCMutex&
+    GetDirShardMutex(fid_t fid)
+    {
+        return *(GetDirShardMutexes() + GetLockShard(fid));
+    }
+
+    // Inode shard mutex at the lock shard index computed from fid.
+    QCMutex&
+    GetInodeShardMutex(fid_t fid)
+    {
+        return *(GetInodeShardMutexes() + GetLockShard(fid));
+    }
+
+    // Appends the address of mutex to the list of locks to be taken.
+    void
+    AddMutex(std::vector<QCMutex*>& locks, QCMutex& mutex)
+    {
+        QCMutex* const ptr = &mutex;
+        locks.push_back(ptr);
+    }
+
+    // Appends the directory shard mutex selected by fid to locks.
+    void
+    AddDirShardMutex(std::vector<QCMutex*>& locks, fid_t fid)
+    {
+        QCMutex& shardMutex = GetDirShardMutex(fid);
+        AddMutex(locks, shardMutex);
+    }
+
+    // Appends the inode shard mutex selected by fid to locks.
+    void
+    AddInodeShardMutex(std::vector<QCMutex*>& locks, fid_t fid)
+    {
+        QCMutex& shardMutex = GetInodeShardMutex(fid);
+        AddMutex(locks, shardMutex);
+    }
+
+    // Appends every directory shard mutex to locks, in array order.
+    void
+    AddAllDirShardMutexes(std::vector<QCMutex*>& locks)
+    {
+        QCMutex* const first = GetDirShardMutexes();
+        for (QCMutex* p = first; p != first + kNamespaceV2ShardCount; ++p) {
+            AddMutex(locks, *p);
+        }
+    }
+
+    // Appends every inode shard mutex to locks, in array order.
+    void
+    AddAllInodeShardMutexes(std::vector<QCMutex*>& locks)
+    {
+        QCMutex* const first = GetInodeShardMutexes();
+        for (QCMutex* p = first; p != first + kNamespaceV2ShardCount; ++p) {
+            AddMutex(locks, *p);
+        }
+    }
+
+    // Scope guard: locks the given mutex on construction, unlocks on exit.
+    class ScopedMutex
+    {
+    public:
+        explicit ScopedMutex(QCMutex& mutex)
+            : mLock(&mutex)
+        {
+            mLock->Lock();
+        }
+        ~ScopedMutex()
+        {
+            mLock->Unlock();
+        }
+    private:
+        QCMutex* const mLock;
+
+        ScopedMutex(const ScopedMutex&);
+        ScopedMutex& operator=(const ScopedMutex&);
+    };
+
+    // Scope guard for a set of mutexes: locks them in ascending address order
+    // (duplicates dropped) and unlocks them in reverse order on destruction.
+    class ScopedMutexGroup
+    {
+    public:
+        // locks is received by value and swapped into the member, so the
+        // list (up to every shard mutex) is not copied a second time.
+        explicit ScopedMutexGroup(
+            std::vector<QCMutex*> locks)
+            : mLocks()
+        {
+            mLocks.swap(locks);
+            std::sort(mLocks.begin(), mLocks.end());
+            mLocks.erase(std::unique(mLocks.begin(), mLocks.end()),
+                mLocks.end());
+            for (size_t i = 0; i < mLocks.size(); ++i) {
+                mLocks[i]->Lock();
+            }
+        }
+        ~ScopedMutexGroup()
+        {
+            for (size_t i = mLocks.size(); i > 0; --i) {
+                mLocks[i - 1]->Unlock();
+            }
+        }
+    private:
+        std::vector<QCMutex*> mLocks;
+
+        ScopedMutexGroup(const ScopedMutexGroup&);
+        ScopedMutexGroup& operator=(const ScopedMutexGroup&);
+    };
+
+    // Sorts the first count pointers of locks by ascending address and
+    // removes duplicates, so that each mutex appears exactly once and in
+    // a canonical order. Taking a set of mutexes in one agreed order is
+    // what lets callers hold several of them without lock order inversion.
+    //
+    // locks: array of at least count pointers; rearranged in place.
+    // count: on entry the number of valid pointers; on return the number
+    //        of distinct pointers, which occupy the front of locks.
+    //
+    // The name and the by-reference count promise de-duplication, so it is
+    // performed here. Callers that run their own de-duplication pass on
+    // the result stay correct: that pass simply finds nothing to remove.
+    static void
+    SortUniqueMutexPtrs(
+        QCMutex** locks,
+        size_t&   count)
+    {
+        if (count <= 1) {
+            return;
+        }
+        enum { kInsertionSortMax = 4 };
+        if (count <= kInsertionSortMax) {
+            // Small inputs (ScopedSmallMutexGroup passes at most four
+            // pointers) are insertion sorted in place; this yields the
+            // same order as std::sort because pointer order is total
+            // over the values being compared here.
+            for (size_t i = 1; i < count; ++i) {
+                QCMutex* const key = locks[i];
+                size_t j = i;
+                while (j > 0 && locks[j - 1] > key) {
+                    locks[j] = locks[j - 1];
+                    --j;
+                }
+                locks[j] = key;
+            }
+        } else {
+            std::sort(locks, locks + count);
+        }
+        // Equal pointers are adjacent once sorted: keep an element only
+        // when it differs from the last one kept.
+        size_t kept = 1;
+        for (size_t i = 1; i < count; ++i) {
+            if (locks[i] != locks[kept - 1]) {
+                locks[kept++] = locks[i];
+            }
+        }
+        count = kept;
+    }
+
+    // Scope guard for up to four mutexes. Null arguments are skipped; the
+    // rest are sorted, de-duplicated, locked, then unlocked in reverse.
+    class ScopedSmallMutexGroup
+    {
+    public:
+        ScopedSmallMutexGroup(
+            QCMutex* lock0,
+            QCMutex* lock1,
+            QCMutex* lock2 = 0,
+            QCMutex* lock3 = 0)
+            : mCount(0)
+        {
+            QCMutex* const args[] = { lock0, lock1, lock2, lock3 };
+            for (size_t i = 0; i < sizeof(args) / sizeof(args[0]); ++i) {
+                Add(args[i]);
+            }
+            SortUniqueMutexPtrs(mLocks, mCount);
+            mCount = DedupeMutexPtrs(mLocks, mCount);
+            for (QCMutex** cur = mLocks; cur != mLocks + mCount; ++cur) {
+                (*cur)->Lock();
+            }
+        }
+        ~ScopedSmallMutexGroup()
+        {
+            for (; mCount > 0; --mCount) {
+                mLocks[mCount - 1]->Unlock();
+            }
+        }
+    private:
+        // Collapses runs of equal adjacent pointers; returns the new length.
+        static size_t DedupeMutexPtrs(QCMutex** locks, size_t count)
+        {
+            if (count <= 1) {
+                return count;
+            }
+            QCMutex** tail = locks;
+            for (QCMutex** cur = locks + 1; cur != locks + count; ++cur) {
+                if (*cur != *tail) {
+                    *++tail = *cur;
+                }
+            }
+            return (size_t)(tail - locks) + 1;
+        }
+        // Stores mutex in the next free slot unless it is null.
+        void Add(QCMutex* mutex)
+        {
+            if (! mutex) { return; }
+            mLocks[mCount++] = mutex;
+        }
+
+        QCMutex* mLocks[4];
+        size_t   mCount;
+
+        ScopedSmallMutexGroup(const ScopedSmallMutexGroup&);
+        ScopedSmallMutexGroup& operator=(const ScopedSmallMutexGroup&);
+    };
+
+
+    // Hsieh hash of name, widened to 64 bits and shifted left by four.
+    uint64_t
+    HashName(const std::string& name)
+    {
+        const uint64_t hashed = uint64_t(Hsieh_hash_fcn()(name));
+        return hashed << 4;
+    }
+
+    // True when entry carries a delete txn at or before committedTxn.
+    bool
+    IsDeletedAt(const VersionedDirEntry& entry, TxnId committedTxn)
+    {
+        const TxnId deleted = entry.deleteTxn;
+        return ! (deleted == kNoTxn || committedTxn < deleted);
+    }
+
+    // Legal: non-empty, within MAX_FILE_NAME_LENGTH, no slash or newline.
+    bool IsLegalName(const std::string& name)
+    {
+        const size_t len = name.size();
+        return 0 < len && len <= MAX_FILE_NAME_LENGTH &&
+            std::string::npos == name.find_first_of("/\n");
+    }
+
+    // Accepts exactly the file, directory, and symlink inode types.
+    bool
+    IsSupportedInodeType(InodeType type)
+    {
+        if (type == kInodeTypeFile) { return true; }
+        return type == kInodeTypeDir || type == kInodeTypeSymlink;
+    }
+
+    // True when entry is non-null and either has no delete txn, or has a
+    // delete that is still flagged pending and that committedTxn has not
+    // reached yet (committedTxn of kNoTxn counts as not reached).
+    bool
+    IsVisibleOrPendingEntry(
+        const VersionedDirEntry* entry,
+        TxnId                    committedTxn)
+    {
+        if (! entry) {
+            return false;
+        }
+        const TxnId deleted = entry->deleteTxn;
+        const bool  deletePendingFlag = entry->pendingFlag &&
+            (committedTxn == kNoTxn || committedTxn < deleted);
+        return deleted == kNoTxn || deletePendingFlag;
+    }
+
+    // Returns 0 for a legal name with a supported inode type, else -EINVAL.
+    int
+    ValidateCreateRequest(
+        const std::string& name,
+        InodeType          type)
+    {
+        const bool validFlag =
+            IsLegalName(name) && IsSupportedInodeType(type);
+        return validFlag ? 0 : -EINVAL;
+    }
+
+    // Name / type checks first, then requires a non-negative childFid and
+    // a txnId that is set and newer than committedTxn. 0 or -EINVAL.
+    int
+    ValidateCreateRequest(
+        const std::string& name,
+        InodeType          type,
+        fid_t              childFid,
+        TxnId              txnId,
+        TxnId              committedTxn)
+    {
+        int status = ValidateCreateRequest(name, type);
+        if (status == 0 &&
+                (childFid < 0 || txnId == kNoTxn || txnId <= committedTxn)) {
+            status = -EINVAL;
+        }
+        return status;
+    }
+
+    // Lower case hexadecimal digit for a value in [0, 15].
+    char
+    HexDigit(int value)
+    {
+        return (char)(value < 10 ? '0' + value : 'a' + (value - 10));
+    }
+
+    // Numeric value of a hexadecimal digit (either case), or -1 otherwise.
+    int
+    HexValue(char value)
+    {
+        if ('0' <= value && value <= '9') { return value - '0'; }
+        if ('a' <= value && value <= 'f') { return value - 'a' + 10; }
+        return ('A' <= value && value <= 'F') ? value - 'A' + 10 : -1;
+    }
+
+    // Hex-encodes value: two lower case digits per input byte.
+    std::string
+    EncodeString(const std::string& value)
+    {
+        std::string encoded;
+        encoded.reserve(value.size() * 2);
+        for (size_t pos = 0; pos != value.size(); ++pos) {
+            const unsigned char byte = (unsigned char)value[pos];
+            encoded += HexDigit(byte >> 4);
+            encoded += HexDigit(byte & 0xf);
+        }
+        return encoded;
+    }
+
+    std::string
+    EncodeName(
+        const std::string& name)
+    {
+        return EncodeString(name);
+    }
+
+    // Inverse of EncodeString; value is only modified on success.
+    bool
+    DecodeHexString(const std::string& encoded, std::string& value)
+    {
+        const size_t len = encoded.size();
+        if ((len & 1) != 0) {
+            return false;
+        }
+        std::string decoded;
+        decoded.reserve(len / 2);
+        for (size_t pos = 0; pos < len; pos += 2) {
+            const int high = HexValue(encoded[pos]);
+            const int low  = HexValue(encoded[pos + 1]);
+            if (high < 0 || low < 0) {
+                return false;
+            }
+            decoded.push_back((char)((high << 4) | low));
+        }
+        value.swap(decoded);
+        return true;
+    }
+
+    // Hex-decodes encoded into name; fails (name untouched) on malformed
+    // input or when the decoded text is not a legal entry name.
+    bool
+    DecodeName(const std::string& encoded, std::string& name)
+    {
+        std::string decoded;
+        if (DecodeHexString(encoded, decoded) && IsLegalName(decoded)) {
+            name.swap(decoded);
+            return true;
+        }
+        return false;
+    }
+
+    // Hex-decodes encoded into path; rejects malformed input, an empty
+    // result, and any result containing a newline. path changes on success.
+    bool
+    DecodePath(const std::string& encoded, std::string& path)
+    {
+        std::string decoded;
+        if (! DecodeHexString(encoded, decoded) || decoded.empty()) {
+            return false;
+        }
+        if (decoded.find('\n') != std::string::npos) { return false; }
+        path.swap(decoded);
+        return true;
+    }
+
+    // Serialized code for an inode type: dir 1, symlink 2, anything else 0.
+    int
+    InodeTypeToInt(InodeType type)
+    {
+        if (type == kInodeTypeDir) { return 1; }
+        return type == kInodeTypeSymlink ? 2 : 0;
+    }
+
+    // Inverse of InodeTypeToInt. Sets type and returns true for the codes
+    // 0 (file), 1 (dir), and 2 (symlink). Any other code returns false and
+    // leaves type unchanged. The table below is indexed by the code, so its
+    // order must stay in step with InodeTypeToInt.
+    bool
+    IntToInodeType(
+        int        value,
+        InodeType& type)
+    {
+        const InodeType kByCode[] = {
+            kInodeTypeFile, kInodeTypeDir, kInodeTypeSymlink
+        };
+        const int kCodeCount = (int)(sizeof(kByCode) / sizeof(kByCode[0]));
+        if (value < 0 || kCodeCount <= value) {
+            return false;
+        }
+        type = kByCode[value];
+        return true;
+    }
+
+    // Text label for an edit log record type; other values give "invalid".
+    const char*
+    EditLogRecordTypeName(EditLogRecord::Type type)
+    {
+        if (type == EditLogRecord::kCreate) { return "create"; }
+        if (type == EditLogRecord::kRemove) { return "remove"; }
+        return type == EditLogRecord::kRename ? "rename" : "invalid";
+    }
+
+    // Structural validation of one edit log record: 0 if valid, -EINVAL if
+    // not. Every record needs a transaction id and a non-negative parent
+    // fid; create, remove, and rename then add their own field checks.
+    int
+    ValidateEditLogRecord(
+        const EditLogRecord& record)
+    {
+        if (record.txnId == kNoTxn || record.parentFid < 0) {
+            return -EINVAL;
+        }
+        const bool nameOkFlag = IsLegalName(record.name);
+        bool       validFlag  = false;
+        if (record.type == EditLogRecord::kCreate) {
+            // Create: a child fid plus one of the three known inode types.
+            validFlag = 0 <= record.fid && nameOkFlag &&
+                (record.inodeType == kInodeTypeFile ||
+                record.inodeType == kInodeTypeDir ||
+                record.inodeType == kInodeTypeSymlink);
+        } else if (record.type == EditLogRecord::kRemove) {
+            // Remove: only the name is checked.
+            validFlag = nameOkFlag;
+        } else if (record.type == EditLogRecord::kRename) {
+            // Rename: a fid plus a non-empty, single line destination path.
+            validFlag = 0 <= record.fid && nameOkFlag &&
+                ! record.newPath.empty() &&
+                record.newPath.find('\n') == std::string::npos;
+        }
+        return validFlag ? 0 : -EINVAL;
+    }
+}
+
+// Defaults: namespace v2 and its RPCs off, large dir threshold 4096,
+// promote wall limit 1000 ms, 128 dir shards.
+Config::Config()
+    : enabledFlag(false), rpcEnabledFlag(false),
+      dirLargeThreshold(4096), dirPromoteMaxWallMs(1000),
+      dirShardCount(128)
+    {}
+
+    // Loads settings from props; constructor defaults serve as the fallbacks.
+    Config
+Config::FromProperties(
+    const Properties& props)
+{
+    Config cfg;
+    cfg.enabledFlag = 0 != props.getValue(
+        "metaServer.namespaceV2.enabled", cfg.enabledFlag ? 1 : 0);
+    cfg.rpcEnabledFlag = 0 != props.getValue(
+        "metaServer.namespaceV2.rpcEnabled", cfg.rpcEnabledFlag ? 1 : 0);
+    cfg.dirLargeThreshold = std::max(props.getValue(
+        "metaServer.dir.largeThreshold", cfg.dirLargeThreshold), 1);
+    cfg.dirPromoteMaxWallMs = std::max(props.getValue(
+        "metaServer.dir.promoteMaxWallMs", cfg.dirPromoteMaxWallMs), 1);
+    cfg.dirShardCount = std::max(props.getValue(
+        "metaServer.namespaceV2.dirShardCount", cfg.dirShardCount), 1);
+    return cfg;
+}
+
+    // Parses a fresh Config out of props and stores it in sConfig.
+    void
+SetParameters(const Properties& props)
+{
+    sConfig = Config::FromProperties(props);
+}
+
+    const Config&
+GetConfig()
+{
+    return sConfig;
+}
+
+
+    // Process-wide store, built from the current config on first use and
+    // never deleted. The function-local static initializer (thread safe
+    // since C++11) keeps racing first calls from each building a store.
+    NamespaceStore&
+GetStore()
+{
+    static NamespaceStore* const sStorePtr = new NamespaceStore(GetConfig());
+    return *sStorePtr;
+}
+
+NameKey::NameKey()
+    : hash(0),
+      name()
+    {}
+
+NameKey::NameKey(
+    const std::string& inName)
+    : hash(HashName(inName)),
+      name(inName)
+    {}
+
+NameKey::NameKey(
+    uint64_t           inHash,
+    const std::string& inName)
+    : hash(inHash),
+      name(inName)
+    {}
+
+    // Orders keys by hash first; names break ties between equal hashes.
+    bool
+NameKey::operator<(const NameKey& other) const
+{
+    return hash != other.hash ? hash < other.hash : name < other.name;
+}
+
+    // Keys are equal only when both the hash and the name match.
+    bool
+NameKey::operator==(const NameKey& other) const
+{
+    return ! (hash != other.hash || name != other.name);
+}
+
+    // Hash functor: folds the stored hash and mixes in the name's std::hash.
+    size_t
+NameKeyHash::operator()(const NameKey& key) const
+{
+    const size_t nameHash = std::hash<std::string>()(key.name);
+    return size_t(key.hash ^ (key.hash >> 33)) ^ (nameHash << 1);
+}
+
+VersionedDirEntry::VersionedDirEntry()
+    : childFid(-1),
+      createTxn(kNoTxn),
+      deleteTxn(kNoTxn),
+      pendingFlag(false)
+    {}
+
+VersionedDirEntry::VersionedDirEntry(
+    fid_t childFid,
+    TxnId createTxn)
+    : childFid(childFid),
+      createTxn(createTxn),
+      deleteTxn(kNoTxn),
+      pendingFlag(true)
+    {}
+
+    // Visible once created at or before committedTxn and not deleted by then.
+    bool
+VersionedDirEntry::IsVisible(TxnId committedTxn) const
+{
+    return ! (committedTxn < createTxn || IsDeletedAt(*this, committedTxn));
+}
+
+InodeRecord::InodeRecord()
+    : fid(-1),
+      parentFid(-1),
+      type(kInodeTypeFile),
+      createTxn(kNoTxn),
+      deleteTxn(kNoTxn),
+      pendingFlag(false),
+      generation(0),
+      user(kKfsUserRoot),
+      group(kKfsGroupRoot),
+      mode(0),
+      numReplicas(1),
+      mtime(0),
+      ctime(0),
+      atime(0)
+    {}
+
+InodeRecord::InodeRecord(
+    fid_t     inFid,
+    fid_t     inParentFid,
+    InodeType inType,
+    TxnId     inCreateTxn,
+    kfsUid_t  inUser,
+    kfsGid_t  inGroup,
+    kfsMode_t inMode,
+    int16_t   inNumReplicas,
+    int64_t   inMtime)
+    : fid(inFid),
+      parentFid(inParentFid),
+      type(inType),
+      createTxn(inCreateTxn),
+      deleteTxn(kNoTxn),
+      pendingFlag(true),
+      generation(0),
+      user(inUser),
+      group(inGroup),
+      mode(inMode),
+      numReplicas(inNumReplicas),
+      mtime(inMtime),
+      ctime(inMtime),
+      atime(inMtime)
+    {}
+
+    // Visible when created by committedTxn and not yet deleted at that txn.
+    bool
+InodeRecord::IsVisible(TxnId committedTxn) const
+{
+    if (committedTxn < createTxn) { return false; }
+    return deleteTxn == kNoTxn || committedTxn < deleteTxn;
+}
+
+    InodeTable::InodeTable()
+    : mTables(kNamespaceV2ShardCount)
+    {}
+
+    // Adds record to its fid's shard; false when the fid is already present.
+    bool
+InodeTable::Insert(const InodeRecord& record)
+{
+    Table& shard = mTables[GetLockShard(record.fid)];
+    return shard.insert(std::make_pair(record.fid, record)).second;
+}
+
+    // Mutable lookup by fid; returns 0 when the fid is not in the table.
+    InodeRecord*
+InodeTable::Find(fid_t fid)
+{
+    Table& shard = mTables[GetLockShard(fid)];
+    Table::iterator const pos = shard.find(fid);
+    return pos != shard.end() ? &pos->second : 0;
+}
+
+    // Read-only lookup by fid; returns 0 when the fid is not in the table.
+    const InodeRecord*
+InodeTable::Find(fid_t fid) const
+{
+    const Table& shard = mTables[GetLockShard(fid)];
+    Table::const_iterator const pos = shard.find(fid);
+    return pos != shard.end() ? &pos->second : 0;
+}
+
+    // Like Find, but returns 0 for records not visible at committedTxn.
+    const InodeRecord*
+InodeTable::FindCommitted(fid_t fid, TxnId committedTxn) const
+{
+    const InodeRecord* const found = Find(fid);
+    if (! found || ! found->IsVisible(committedTxn)) { return 0; }
+    return found;
+}
+
+    // Stamps the record for fid with deleteTxn as a pending delete and bumps
+    // its generation. Fails if the fid is unknown or already has a delete txn.
+    bool
+InodeTable::MarkDeleted(fid_t fid, TxnId deleteTxn)
+{
+    InodeRecord* const target = Find(fid);
+    const bool liveFlag = target && target->deleteTxn == kNoTxn;
+    if (liveFlag) {
+        target->deleteTxn   = deleteTxn;
+        target->pendingFlag = true;
+        ++target->generation;
+    }
+    return liveFlag;
+}
+
+    // Re-parents the record for fid and bumps its generation. Fails if the
+    // fid is unknown or the record already carries a delete txn.
+    bool
+InodeTable::Move(fid_t fid, fid_t parentFid)
+{
+    InodeRecord* const target = Find(fid);
+    if (target && target->deleteTxn == kNoTxn) {
+        target->parentFid = parentFid;
+        ++target->generation;
+        return true;
+    }
+    return false;
+}
+
+    // Replaces the contents of records with a copy of every inode record
+    // that is visible at committedTxn, gathered from all shards and then
+    // sorted by ascending fid.
+    void
+InodeTable::GetCommitted(
+    TxnId committedTxn,
+    std::vector<InodeRecord>& records) const
+{
+    records.clear();
+    // Size() counts every stored record: an upper bound on the result.
+    records.reserve(Size());
+    for (Tables::const_iterator tableIt = mTables.begin();
+            tableIt != mTables.end();
+            ++tableIt) {
+        for (Table::const_iterator it = tableIt->begin();
+                it != tableIt->end();
+                ++it) {
+            const InodeRecord& record = it->second;
+            if (record.IsVisible(committedTxn)) {
+                records.push_back(record);
+            }
+        }
+    }
+    // Shard iteration order is arbitrary; callers get fid order instead.
+    std::sort(records.begin(), records.end(),
+        [](const InodeRecord& a, const InodeRecord& b) {
+            return a.fid < b.fid;
+        });
+}
+
+    // Clears pendingFlag where create and any delete txn are committed.
+    void
+InodeTable::CommitThrough(TxnId committedTxn)
+{
+    for (Tables::iterator tableIt = mTables.begin();
+            tableIt != mTables.end(); ++tableIt) {
+        for (Table::iterator it = tableIt->begin();
+                it != tableIt->end();
+                ++it) {
+            InodeRecord& record = it->second;
+            const bool deleteSettledFlag = record.deleteTxn == kNoTxn ||
+                record.deleteTxn <= committedTxn;
+            if (record.createTxn <= committedTxn && deleteSettledFlag) {
+                record.pendingFlag = false;
+            }
+        }
+    }
+}
+
+    // Total number of stored records across all shards, visible or not.
+    size_t
+InodeTable::Size() const
+{
+    size_t total = 0;
+    for (Tables::const_iterator shard = mTables.begin();
+            shard != mTables.end(); ++shard) {
+        total += shard->size();
+    }
+    return total;
+}
+
+ReaddirCookie::ReaddirCookie()
+    : generation(0),
+      layout(kDirStateSmall),
+      hasLastKeyFlag(false),
+      lastKey()
+    {}
+
+ReaddirResult::Entry::Entry(
+    const NameKey& inKey,
+    fid_t          inChildFid)
+    : key(inKey),
+      childFid(inChildFid)
+    {}
+
+ReaddirResult::ReaddirResult()
+    : entries(),
+      moreEntriesFlag(false),
+      nextCookie()
+    {}
+
+CheckpointDirEntry::CheckpointDirEntry(
+    fid_t          inParentFid,
+    const NameKey& inKey,
+    fid_t          inChildFid)
+    : parentFid(inParentFid),
+      key(inKey),
+      childFid(inChildFid)
+    {}
+
+DirNode::DirNode(
+    int largeThreshold,
+    int promoteMaxWallMs)
+    : mState(kDirStateSmall),
+      mGeneration(0),
+      mLargeThreshold(std::max(1, largeThreshold)),
+      mPromoteMaxWallMs(std::max(1, promoteMaxWallMs)),
+      mChildCount(0),
+      mSmall(),
+      mLarge()
+    {}
+
+    DirState
+DirNode::GetCookieLayout() const
+{
+    return mState == kDirStateLarge ? kDirStateLarge : kDirStateSmall;
+}
+
+    DirState
+DirNode::GetState() const
+{
+    return mState;
+}
+
+    uint64_t
+DirNode::GetGeneration() const
+{
+    return mGeneration;
+}
+
+    void
+DirNode::SetGeneration(
+    uint64_t generation)
+{
+    mGeneration = generation;
+}
+
+    size_t
+DirNode::GetChildCount() const
+{
+    return mChildCount;
+}
+
+    bool
+DirNode::IsLarge() const
+{
+    return mState == kDirStateLarge;
+}
+
+    bool
+DirNode::HasVisibleOrPendingName(
+    const NameKey& key,
+    TxnId          committedTxn) const
+{
+    return IsVisibleOrPendingEntry(Find(key), committedTxn);
+}
+
+    bool
+DirNode::HasVisibleOrPendingName(
+    const std::string& name,
+    TxnId              committedTxn) const
+{
+    return HasVisibleOrPendingName(NameKey(name), committedTxn);
+}
+
+    // Name-based overload: rejects illegal names with -EINVAL, otherwise
+    // builds a NameKey from name and forwards to the key-based overload.
+    int
+DirNode::InsertPending(
+    const std::string& name,
+    fid_t              childFid,
+    TxnId              txnId,
+    TxnId              committedTxn,
+    bool               replaceDeletedFlag)
+{
+    return IsLegalName(name) ?
+        InsertPending(NameKey(name), childFid, txnId, committedTxn,
+            replaceDeletedFlag) : -EINVAL;
+}
+
+    // Adds key -> childFid as a pending entry created by txnId. Errors:
+    // -EINVAL (unset txnId / negative childFid), -EBUSY (promoting), -EEXIST
+    // (name visible or pending, unless replacing this txn's own delete).
+    int
+DirNode::InsertPending(
+    const NameKey& key,
+    fid_t          childFid,
+    TxnId          txnId,
+    TxnId          committedTxn,
+    bool           replaceDeletedFlag)
+{
+    if (txnId == kNoTxn || childFid < 0) { return -EINVAL; }
+    if (mState == kDirStatePromoting) { return -EBUSY; }
+    const VersionedDirEntry* const existing = Find(key);
+    const bool replaceFlag = replaceDeletedFlag && existing &&
+        existing->deleteTxn == txnId;
+    if (! replaceFlag) {
+        if (IsVisibleOrPendingEntry(existing, committedTxn)) {
+            return -EEXIST;
+        }
+        if (mState == kDirStateSmall &&
+                (int)(mSmall.size() + 1) > mLargeThreshold) {
+            const int status = Promote();
+            if (status != 0) {
+                return status;
+            }
+        }
+    }
+    const VersionedDirEntry entry(childFid, txnId);
+    if (mState == kDirStateLarge) {
+        mLarge[key] = entry;
+    } else {
+        mSmall[key] = entry;
+        IncrementSmallGeneration();
+    }
+    ++mChildCount;
+    return 0;
+}
+
+    // Adds name -> childFid as an already committed entry: create txn is
+    // kNoTxn and pendingFlag is cleared. Returns -EINVAL for a negative
+    // childFid or illegal name, -EEXIST if any entry (deleted or not) holds
+    // the name, or the Promote() status if promotion fails.
+    int
+DirNode::InsertCommitted(
+    const std::string& name,
+    fid_t              childFid)
+{
+    if (childFid < 0 || ! IsLegalName(name)) { return -EINVAL; }
+    const NameKey key(name);
+    if (Find(key)) { return -EEXIST; }
+    if (mState == kDirStateSmall &&
+            (int)(mSmall.size() + 1) > mLargeThreshold) {
+        const int status = Promote();
+        if (status != 0) {
+            return status;
+        }
+    }
+    VersionedDirEntry entry(childFid, kNoTxn);
+    entry.pendingFlag = false;
+    if (mState == kDirStateLarge) {
+        mLarge[key] = entry;
+    } else {
+        mSmall[key] = entry;
+        IncrementSmallGeneration();
+    }
+    ++mChildCount;
+    return 0;
+}
+
+    // Appends every child visible at committedTxn to entries: small layout
+    // entries in ascending key order, large ones in mLarge iteration order.
+    void
+DirNode::GetCommittedEntries(
+    TxnId committedTxn,
+    std::vector<ReaddirResult::Entry>& entries) const
+{
+    if (mState == kDirStateLarge) {
+        for (LargeEntries::const_iterator it = mLarge.begin();
+                it != mLarge.end();
+                ++it) {
+            if (! it->second.IsVisible(committedTxn)) {
+                continue;
+            }
+            entries.push_back(
+                ReaddirResult::Entry(it->first, it->second.childFid));
+        }
+        return;
+    }
+    const size_t firstNew = entries.size();
+    for (SmallEntries::const_iterator it = mSmall.begin();
+            it != mSmall.end();
+            ++it) {
+        if (it->second.IsVisible(committedTxn)) {
+            entries.push_back(
+                ReaddirResult::Entry(it->first, it->second.childFid));
+        }
+    }
+    std::sort(entries.begin() + firstNew, entries.end(),
+        [](const ReaddirResult::Entry& a, const ReaddirResult::Entry& b) {
+            return a.key < b.key;
+        });
+}
+
+    // Marks the entry for name as deleted by txnId and flags it pending.
+    // On success: stores the child's fid in *childFidPtr when given, lowers
+    // the child count (never below zero), and bumps the generation unless
+    // large. Returns -EINVAL for an unset txnId, -ENOENT when no live entry.
+    int
+DirNode::DeletePending(
+    const std::string& name,
+    TxnId              txnId,
+    fid_t*             childFidPtr)
+{
+    if (txnId == kNoTxn) { return -EINVAL; }
+    VersionedDirEntry* const victim = FindMutable(NameKey(name));
+    if (! victim || victim->deleteTxn != kNoTxn) {
+        return -ENOENT;
+    }
+    victim->deleteTxn   = txnId;
+    victim->pendingFlag = true;
+    if (childFidPtr) {
+        *childFidPtr = victim->childFid;
+    }
+    if (mChildCount > 0) {
+        --mChildCount;
+    }
+    IncrementSmallGeneration();
+    return 0;
+}
+
+    // Entry for name if it is visible at committedTxn, otherwise 0.
+    const VersionedDirEntry*
+DirNode::LookupCommitted(const std::string& name, TxnId committedTxn) const
+{
+    const VersionedDirEntry* const found = Find(NameKey(name));
+    if (found && found->IsVisible(committedTxn)) { return found; }
+    return 0;
+}
+
+    // Lists one page of children visible at committedTxn, in ascending key
+    // order. cookiePtr, when given, must match this directory's current
+    // generation and cookie layout (else -EINVAL); listing resumes after
+    // its lastKey. At most maxEntries are returned and moreEntriesFlag is
+    // set when further entries remain. result is reset first; 0 on success.
+    int
+DirNode::ReaddirCommitted(
+    TxnId                 committedTxn,
+    const ReaddirCookie*  cookiePtr,
+    size_t                maxEntries,
+    ReaddirResult&        result) const
+{
+    result = ReaddirResult();
+    if (maxEntries == 0) {
+        return 0;
+    }
+    const DirState layout = GetCookieLayout();
+    if (cookiePtr && (cookiePtr->generation != mGeneration ||
+            cookiePtr->layout != layout)) {
+        return -EINVAL;
+    }
+    result.nextCookie.generation = mGeneration;
+    result.nextCookie.layout     = layout;
+    const bool    resumeFlag = cookiePtr && cookiePtr->hasLastKeyFlag;
+    const NameKey resumeKey  = resumeFlag ? cookiePtr->lastKey : NameKey();
+    // Gather every visible entry that sorts after the resume key; the
+    // containers are walked in their own order and sorted afterwards.
+    std::vector<ReaddirResult::Entry> candidates;
+    if (mState == kDirStateLarge) {
+        candidates.reserve(mLarge.size());
+        for (LargeEntries::const_iterator it = mLarge.begin();
+                it != mLarge.end();
+                ++it) {
+            if (it->second.IsVisible(committedTxn) &&
+                    (! resumeFlag || resumeKey < it->first)) {
+                candidates.push_back(ReaddirResult::Entry(
+                    it->first, it->second.childFid));
+            }
+        }
+    } else {
+        candidates.reserve(mSmall.size());
+        for (SmallEntries::const_iterator it = mSmall.begin();
+                it != mSmall.end();
+                ++it) {
+            if (it->second.IsVisible(committedTxn) &&
+                    (! resumeFlag || resumeKey < it->first)) {
+                candidates.push_back(ReaddirResult::Entry(
+                    it->first, it->second.childFid));
+            }
+        }
+    }
+    std::sort(candidates.begin(), candidates.end(),
+        [](const ReaddirResult::Entry& lhs,
+                const ReaddirResult::Entry& rhs) {
+            return lhs.key < rhs.key;
+        });
+    // Emit the first page; anything left over sets moreEntriesFlag.
+    const size_t pageSize = std::min(maxEntries, candidates.size());
+    for (size_t i = 0; i < pageSize; ++i) {
+        result.entries.push_back(candidates[i]);
+    }
+    result.moreEntriesFlag = pageSize < candidates.size();
+    // The cookie's last key is only set when this page has entries; an
+    // empty page leaves hasLastKeyFlag at its reset value.
+    if (pageSize > 0) {
+        result.nextCookie.hasLastKeyFlag = true;
+        result.nextCookie.lastKey = result.entries.back().key;
+    }
+    return 0;
+}
+
+    // Clears pendingFlag on every entry of the active layout whose create
+    // txn, and delete txn if one is set, are at or before committedTxn.
+    void
+DirNode::CommitThrough(TxnId committedTxn)
+{
+    const auto settle = [committedTxn](VersionedDirEntry& entry) {
+        if (entry.createTxn <= committedTxn &&
+                (entry.deleteTxn == kNoTxn ||
+                    entry.deleteTxn <= committedTxn)) {
+            entry.pendingFlag = false;
+        }
+    };
+    if (mState == kDirStateLarge) {
+        for (LargeEntries::iterator it = mLarge.begin();
+                it != mLarge.end();
+                ++it) {
+            settle(it->second);
+        }
+    } else {
+        for (SmallEntries::iterator it = mSmall.begin();
+                it != mSmall.end();
+                ++it) {
+            settle(it->second);
+        }
+    }
+}
+
+    // Moves all entries from the small layout into the large one. Returns 0
+    // if already large or on success (generation bumped), -EINVAL from any
+    // other non-small state, -EBUSY (still small) on wall time overrun.
+    int
+DirNode::Promote()
+{
+    if (mState == kDirStateLarge) { return 0; }
+    if (mState != kDirStateSmall) { return -EINVAL; }
+    enum { kPromoteBatchSize = 512 };
+    mState = kDirStatePromoting;
+    const int64_t deadlineUsec =
+        microseconds() + (int64_t)mPromoteMaxWallMs * 1000;
+    LargeEntries staging;
+    staging.reserve(mSmall.size());
+    size_t copied = 0;
+    for (SmallEntries::const_iterator it = mSmall.begin();
+            it != mSmall.end(); ) {
+        staging.insert(*it);
+        ++it;
+        // The clock is read only at batch boundaries, and never once the
+        // final entry has been copied.
+        if (++copied % kPromoteBatchSize == 0 && it != mSmall.end() &&
+                microseconds() > deadlineUsec) {
+            mState = kDirStateSmall;
+            return -EBUSY;
+        }
+    }
+    mLarge.swap(staging);
+    mSmall.clear();
+    mState = kDirStateLarge;
+    ++mGeneration;
+    return 0;
+}
+
+    VersionedDirEntry*  // Writable entry for key, or 0 when key is absent.
+DirNode::FindMutable(
+    const NameKey& key)
+{
+    if (mState == kDirStateLarge) {  // large state: search mLarge
+        LargeEntries::iterator const it = mLarge.find(key);
+        return it == mLarge.end() ? 0 : &it->second;
+    }
+    SmallEntries::iterator const it = mSmall.find(key);  // otherwise mSmall
+    return it == mSmall.end() ? 0 : &it->second;
+}
+
+    const VersionedDirEntry*  // Read-only entry for key, or 0 when absent.
+DirNode::Find(
+    const NameKey& key) const
+{
+    if (mState == kDirStateLarge) {  // large state: search mLarge
+        LargeEntries::const_iterator const it = mLarge.find(key);
+        return it == mLarge.end() ? 0 : &it->second;
+    }
+    SmallEntries::const_iterator const it = mSmall.find(key);  // else mSmall
+    return it == mSmall.end() ? 0 : &it->second;
+}
+
+    void  // Bump mGeneration unless the directory is in the large state.
+DirNode::IncrementSmallGeneration()
+{
+    if (mState != kDirStateLarge) {  // also true while promoting
+        mGeneration++;
+    }
+}
+
+    DirTable::DirTable()  // Builds the per-shard table container.
+    : mTables(kNamespaceV2ShardCount)  // sized with the shard count constant
+    {}
+
+    bool  // Store a copy of dir under dirFid; result of insert(...).second.
+DirTable::Insert(
+    fid_t          dirFid,
+    const DirNode& dir)
+{
+    Table& table = mTables[GetLockShard(dirFid)];  // shard chosen by fid
+    return table.insert(std::make_pair(dirFid, dir)).second;
+}
+
+    DirNode*  // Writable DirNode for dirFid, or 0 when it is not stored.
+DirTable::Find(
+    fid_t dirFid)
+{
+    Table& table = mTables[GetLockShard(dirFid)];  // shard chosen by fid
+    Table::iterator const it = table.find(dirFid);
+    return it == table.end() ? 0 : &it->second;
+}
+
+    const DirNode*  // Read-only DirNode for dirFid, or 0 when not stored.
+DirTable::Find(
+    fid_t dirFid) const
+{
+    const Table& table = mTables[GetLockShard(dirFid)];  // shard by fid
+    Table::const_iterator const it = table.find(dirFid);
+    return it == table.end() ? 0 : &it->second;
+}
+
+    void  // Forward committedTxn to every DirNode in every shard table.
+DirTable::CommitThrough(
+    TxnId committedTxn)
+{
+    for (Tables::iterator tableIt = mTables.begin();  // each shard table
+            tableIt != mTables.end();
+            ++tableIt) {
+        for (Table::iterator it = tableIt->begin();  // each directory in it
+                it != tableIt->end();
+                ++it) {
+            it->second.CommitThrough(committedTxn);
+        }
+    }
+}
+
+    size_t  // Total number of stored directories, summed over all tables.
+DirTable::Size() const
+{
+    size_t ret = 0;
+    for (Tables::const_iterator it = mTables.begin();
+            it != mTables.end();
+            ++it) {
+        ret += it->size();  // one pass over the shard tables per call
+    }
+    return ret;
+}
+
+    void  // Collect committed entries of all directories, by ascending fid.
+DirTable::GetCommittedEntries(
+    TxnId committedTxn,
+    std::vector<CheckpointDirEntry>& entries) const  // cleared, then filled
+{
+    entries.clear();
+    std::vector<fid_t> dirFids;
+    dirFids.reserve(Size());
+    for (Tables::const_iterator tableIt = mTables.begin();
+            tableIt != mTables.end();
+            ++tableIt) {
+        for (Table::const_iterator it = tableIt->begin();
+                it != tableIt->end();
+                ++it) {
+            dirFids.push_back(it->first);  // gather every directory fid
+        }
+    }
+    std::sort(dirFids.begin(), dirFids.end());  // order no longer by shard
+    for (std::vector<fid_t>::const_iterator it = dirFids.begin();
+            it != dirFids.end();
+            ++it) {
+        const DirNode* const dir = Find(*it);
+        if (! dir) {
+            continue;  // fid not found on the second lookup: skip it
+        }
+        std::vector<ReaddirResult::Entry> dirEntries;
+        dir->GetCommittedEntries(committedTxn, dirEntries);
+        for (std::vector<ReaddirResult::Entry>::const_iterator entryIt =
+                    dirEntries.begin();
+                entryIt != dirEntries.end();
+                ++entryIt) {
+            entries.push_back(CheckpointDirEntry(  // (dirFid, key, childFid)
+                *it, entryIt->key, entryIt->childFid));
+        }
+    }
+}
+
+    void  // List (dirFid, generation) for every directory, sorted by fid.
+DirTable::GetDirGenerations(
+    std::vector<std::pair<fid_t, uint64_t> >& generations) const  // replaced
+{
+    generations.clear();
+    generations.reserve(Size());
+    for (Tables::const_iterator tableIt = mTables.begin();
+            tableIt != mTables.end();
+            ++tableIt) {
+        for (Table::const_iterator it = tableIt->begin();
+                it != tableIt->end();
+                ++it) {
+            generations.push_back(std::make_pair(
+                it->first, it->second.GetGeneration()));
+        }
+    }
+    std::sort(generations.begin(), generations.end(),
+        [](const std::pair<fid_t, uint64_t>& lhs,
+                const std::pair<fid_t, uint64_t>& rhs) {
+            return lhs.first < rhs.first;  // compare the fid only
+        });
+}
+
+LookupResult::LookupResult()  // Default value used to reset lookup outputs.
+    : fid(-1),  // -1: no inode resolved yet
+      type(kInodeTypeFile),
+      parentGeneration(0),
+      user(kKfsUserRoot),
+      group(kKfsGroupRoot),
+      mode(0),
+      numReplicas(1),
+      mtime(0),
+      ctime(0),
+      atime(0),
+      fileCount(0),
+      dirCount(0)
+    {}
+
+CreateResult::CreateResult()  // Starts with no fid and no transaction.
+    : fid(-1),
+      txnId(kNoTxn),
+      parentGeneration(0)
+    {}
+
+EditLogRecord::EditLogRecord()  // Starts as a kInvalid record.
+    : type(kInvalid),
+      txnId(kNoTxn),
+      parentFid(-1),
+      name(),
+      fid(-1),
+      inodeType(kInodeTypeFile),
+      user(kKfsUserRoot),
+      group(kKfsGroupRoot),
+      mode(0),
+      numReplicas(1),
+      mtime(0),
+      newPath(),  // written and parsed only for rename records
+      overwriteFlag(false)  // written and parsed only for rename records
+    {}
+
+    int  // Write record as one text line; 0, validation status, or -EIO.
+WriteEditLog(
+    std::ostream&       os,
+    const EditLogRecord& record)
+{
+    const int status = ValidateEditLogRecord(record);
+    if (status != 0) {
+        return status;  // nothing is written for an invalid record
+    }
+    os << "namespacev2_edit 1 " << EditLogRecordTypeName(record.type) << " ";
+    if (record.type == EditLogRecord::kCreate) {
+        os << record.txnId << " " << record.parentFid << " " <<  // create:
+            record.fid << " " << InodeTypeToInt(record.inodeType) <<
+            " " << record.user << " " << record.group << " " <<
+            record.mode << " " << record.numReplicas << " " <<
+            record.mtime << " " << EncodeName(record.name);  // name is last
+    } else if (record.type == EditLogRecord::kRemove) {
+        os << record.txnId << " " << record.parentFid << " " <<  // remove:
+            InodeTypeToInt(record.inodeType) << " " <<
+            EncodeName(record.name);
+    } else if (record.type == EditLogRecord::kRename) {
+        os << record.txnId << " " << record.parentFid << " " <<  // rename:
+            record.fid << " " << (record.overwriteFlag ? 1 : 0) <<
+            " " << EncodeName(record.name) << " " <<
+            EncodeString(record.newPath);
+    }
+    os << char(10);  // character code 10 (line feed) ends the record
+    return os.good() ? 0 : -EIO;
+}
+
+    int  // Parse one edit log line into record; 0 or a negative status.
+ReadEditLog(
+    const std::string& line,
+    EditLogRecord&    record)  // assigned only when parsing succeeds
+{
+    std::istringstream is(line);
+    std::string magic;
+    int version = 0;
+    std::string op;
+    if (! (is >> magic >> version >> op) ||
+            magic != "namespacev2_edit" || version != 1) {
+        return -EINVAL;  // wrong header or unsupported version
+    }
+    EditLogRecord tmp;  // parsed aside so a failure leaves record untouched
+    if (op == "create") {
+        int typeValue = -1;
+        std::string encodedName;
+        if (! (is >> tmp.txnId >> tmp.parentFid >> tmp.fid >>
+                typeValue >> tmp.user >> tmp.group >> tmp.mode >>
+                tmp.numReplicas >> tmp.mtime >> encodedName) ||
+                ! IntToInodeType(typeValue, tmp.inodeType) ||
+                ! DecodeName(encodedName, tmp.name)) {
+            return -EINVAL;
+        }
+        tmp.type = EditLogRecord::kCreate;
+    } else if (op == "remove") {
+        int typeValue = -1;
+        std::string encodedName;
+        if (! (is >> tmp.txnId >> tmp.parentFid >> typeValue >>
+                encodedName) ||
+                ! IntToInodeType(typeValue, tmp.inodeType) ||
+                ! DecodeName(encodedName, tmp.name)) {
+            return -EINVAL;
+        }
+        tmp.type = EditLogRecord::kRemove;
+    } else if (op == "rename") {
+        int overwrite = 0;
+        std::string encodedName;
+        std::string encodedPath;
+        if (! (is >> tmp.txnId >> tmp.parentFid >> tmp.fid >>
+                overwrite >> encodedName >> encodedPath) ||
+                ! DecodeName(encodedName, tmp.name) ||
+                ! DecodePath(encodedPath, tmp.newPath)) {
+            return -EINVAL;
+        }
+        tmp.type = EditLogRecord::kRename;
+        tmp.overwriteFlag = overwrite != 0;  // any non-zero value means true
+    } else {
+        return -EINVAL;  // unknown operation name
+    }
+    std::string extra;
+    if (is >> extra) {
+        return -EINVAL;  // trailing token after the expected fields
+    }
+    const int status = ValidateEditLogRecord(tmp);
+    if (status != 0) {
+        return status;
+    }
+    record = tmp;
+    return 0;
+}
+
+NamespaceStore::NamespaceStore(  // Builds a store holding only the root dir.
+    const Config& inConfig,
+    fid_t         rootFid)
+    : mConfig(inConfig),
+      mRootFid(rootFid),
+      mNextFid(rootFid + 1),  // first fid handed out follows the root fid
+      mNextTxn(kNoTxn),
+      mCommittedTxn(kNoTxn),
+      mInodes(),
+      mDirs()
+{
+    InodeRecord root;
+    root.fid         = rootFid;
+    root.parentFid   = rootFid;  // root is recorded as its own parent
+    root.type        = kInodeTypeDir;
+    root.createTxn   = kNoTxn;
+    root.deleteTxn   = kNoTxn;
+    root.pendingFlag = false;
+    root.mode        = 0777;
+    mInodes.Insert(root);
+    mDirs.Insert(rootFid, DirNode(  // fresh DirNode for the root directory
+        mConfig.dirLargeThreshold, mConfig.dirPromoteMaxWallMs));
+}
+
+    fid_t  // Root fid given to the constructor.
+NamespaceStore::GetRootFid() const
+{
+    return mRootFid;
+}
+
+    TxnId  // Same value as GetCommittedTxnSnapshot().
+NamespaceStore::GetCommittedTxn() const
+{
+    return GetCommittedTxnSnapshot();
+}
+
+    TxnId  // Same value as GetLastTxnSnapshot().
+NamespaceStore::GetLastTxn() const
+{
+    return GetLastTxnSnapshot();
+}
+
+    size_t  // mInodes.Size(), read while the gathered mutexes are held.
+NamespaceStore::GetInodeCount() const
+{
+    std::vector<QCMutex*> locks;
+    AddAllInodeShardMutexes(locks);  // fills locks for the group below
+    ScopedMutexGroup locker(locks);
+    return mInodes.Size();
+}
+
+    size_t  // mDirs.Size(), read while the gathered mutexes are held.
+NamespaceStore::GetDirCount() const
+{
+    std::vector<QCMutex*> locks;
+    AddAllDirShardMutexes(locks);  // fills locks for the group below
+    ScopedMutexGroup locker(locks);
+    return mDirs.Size();
+}
+
+    TxnId  // Current mCommittedTxn; no mutex is taken here.
+NamespaceStore::GetCommittedTxnSnapshot() const
+{
+    return mCommittedTxn.load(std::memory_order_acquire);  // acquire load
+}
+
+    TxnId  // Current mNextTxn, read under the txn mutex.
+NamespaceStore::GetLastTxnSnapshot() const
+{
+    ScopedMutex locker(GetTxnMutex());
+    return mNextTxn;  // allocators store the id they just issued here
+}
+
+    void  // Reserve one fid and one txn id; forwards to AllocateCreateIds.
+NamespaceStore::ReserveCreateIds(
+    fid_t& childFid,  // out: reserved fid
+    TxnId& txnId)     // out: reserved txn id
+{
+    AllocateCreateIds(childFid, txnId);
+}
+
+    void  // Reserve count consecutive fids and txn ids in one step.
+NamespaceStore::ReserveCreateIdsRange(
+    size_t count,
+    fid_t& firstFid,  // out: first reserved fid, -1 when count is 0
+    TxnId& firstTxn)  // out: first reserved txn id, 0 when count is 0
+{
+    if (count == 0) {
+        firstFid = -1;
+        firstTxn = 0;  // NOTE(review): literal 0, not kNoTxn - confirm equal
+        return;  // nothing reserved; the txn mutex is not taken
+    }
+    ScopedMutex locker(GetTxnMutex());
+    firstFid = mNextFid;
+    firstTxn = mNextTxn + 1;
+    mNextFid = firstFid + (fid_t)count;  // next free fid after the range
+    mNextTxn = firstTxn + (TxnId)count - 1;  // last txn id of the range
+}
+
+    void  // Issue the next fid and the next txn id under the txn mutex.
+NamespaceStore::AllocateCreateIds(
+    fid_t& childFid,  // out: previous mNextFid
+    TxnId& txnId)     // out: previous mNextTxn + 1
+{
+    ScopedMutex locker(GetTxnMutex());
+    childFid = mNextFid;
+    txnId = mNextTxn + 1;
+    mNextFid = childFid + 1;
+    mNextTxn = txnId;  // mNextTxn keeps the id just issued
+}
+
+    TxnId  // Issue the next txn id (mNextTxn + 1) under the txn mutex.
+NamespaceStore::AllocateTxnId()
+{
+    ScopedMutex locker(GetTxnMutex());
+    const TxnId txnId = mNextTxn + 1;
+    mNextTxn = txnId;  // mNextTxn keeps the id just issued
+    return txnId;
+}
+
+    void  // Raise the id seeds so the given fid / txn id are not reissued.
+NamespaceStore::AdvanceSeeds(
+    fid_t fid,    // negative: leave mNextFid unchanged
+    TxnId txnId)
+{
+    ScopedMutex locker(GetTxnMutex());
+    if (fid >= 0) {
+        mNextFid = std::max(mNextFid, fid + 1);  // never moves backwards
+    }
+    mNextTxn = std::max(mNextTxn, txnId);  // never moves backwards
+}
+
+    int  // Forwards every argument unchanged to ApplyCreatePending.
+NamespaceStore::Create(
+    fid_t              parentFid,
+    const std::string& name,
+    InodeType          type,
+    CreateResult*      resultPtr,
+    kfsUid_t           user,
+    kfsGid_t           group,
+    kfsMode_t          mode,
+    int16_t            numReplicas,
+    int64_t            mtime)
+{
+    return ApplyCreatePending(parentFid, name, type, resultPtr,
+        user, group, mode, numReplicas, mtime);
+}
+
+    int  // Validate, reserve ids, then ApplyCreate; 0 or negative status.
+NamespaceStore::ApplyCreatePending(
+    fid_t              parentFid,
+    const std::string& name,
+    InodeType          type,
+    CreateResult*      resultPtr,  // optional; filled only on success
+    kfsUid_t           user,
+    kfsGid_t           group,
+    kfsMode_t          mode,
+    int16_t            numReplicas,
+    int64_t            mtime)
+{
+    int status = ValidateCreateRequest(name, type);
+    if (status != 0) {
+        return status;
+    }
+    const NameKey key(name);
+    {  // parent/name check runs under the parent's dir + inode shard locks
+        ScopedSmallMutexGroup parentLocker(
+            &GetDirShardMutex(parentFid),
+            &GetInodeShardMutex(parentFid));
+        status = CheckCreateParentName(
+            parentFid, key, GetCommittedTxnSnapshot());
+        if (status != 0) {
+            return status;  // no fid or txn id is consumed on this path
+        }
+    }  // locks released before ids are reserved
+    fid_t childFid = -1;
+    TxnId txnId = kNoTxn;
+    ReserveCreateIds(childFid, txnId);
+    status = ApplyCreate(parentFid, name, type, childFid, txnId,
+        user, group, mode, numReplicas, mtime, false, true);
+    if (status == 0 && resultPtr) {
+        resultPtr->fid = childFid;
+        resultPtr->txnId = txnId;
+        // NOTE(review): read below is outside the scoped lock above - confirm
+        const DirNode* const dir = mDirs.Find(parentFid);
+        resultPtr->parentGeneration = dir ? dir->GetGeneration() : 0;
+    }
+    return status;
+}
+
+
+    int  // String-name overload: validate, then call the NameKey overload.
+NamespaceStore::CreateSelf(
+    fid_t              parentFid,
+    const std::string& name,
+    InodeType          type,
+    fid_t              childFid,
+    TxnId              txnId,
+    CreateResult*      resultPtr,
+    kfsUid_t           user,
+    kfsGid_t           group,
+    kfsMode_t          mode,
+    int16_t            numReplicas,
+    int64_t            mtime,
+    TxnId              committedTxn)
+{
+    int status = ValidateCreateRequest(
+        name, type, childFid, txnId, committedTxn);
+    if (status != 0) {
+        return status;
+    }
+    return CreateSelf(parentFid, NameKey(name), type, childFid, txnId,
+        resultPtr, user, group, mode, numReplicas, mtime, committedTxn);
+}
+
+    int  // Insert a pending entry + inode with caller-chosen fid and txn id.
+NamespaceStore::CreateSelf(
+    fid_t              parentFid,
+    const NameKey&     key,
+    InodeType          type,
+    fid_t              childFid,
+    TxnId              txnId,
+    CreateResult*      resultPtr,  // optional; filled only on success
+    kfsUid_t           user,
+    kfsGid_t           group,
+    kfsMode_t          mode,
+    int16_t            numReplicas,
+    int64_t            mtime,
+    TxnId              committedTxn)
+{  // No mutex is taken in this body; presumably the caller holds them.
+    if (childFid < 0 || txnId == kNoTxn || txnId <= committedTxn ||
+            ! IsSupportedInodeType(type)) {
+        return -EINVAL;  // txn id must be newer than committedTxn
+    }
+    if (mInodes.Find(childFid)) {
+        return -EEXIST;  // an inode record already uses childFid
+    }
+    DirNode* dir = 0;
+    int status = ResolveCreateParentDir(parentFid, committedTxn, dir);
+    if (status != 0) {
+        return status;
+    }
+    status = dir->InsertPending(key, childFid, txnId, committedTxn);
+    if (status != 0) {
+        return status;
+    }
+    if (! mInodes.Insert(InodeRecord(childFid, parentFid, type, txnId,
+            user, group, mode, numReplicas, mtime))) {
+        return -EEXIST;  // NOTE(review): pending entry above is not undone
+    }
+    if (type == kInodeTypeDir &&  // directories also get their own DirNode
+            ! mDirs.Insert(childFid, DirNode(
+                mConfig.dirLargeThreshold, mConfig.dirPromoteMaxWallMs))) {
+        return -EEXIST;  // NOTE(review): entry and inode are not undone
+    }
+    if (resultPtr) {
+        resultPtr->fid = childFid;
+        resultPtr->txnId = txnId;
+        resultPtr->parentGeneration = dir->GetGeneration();
+    }
+    return 0;
+}
+
+    int  // Like CreateSelf(NameKey), but the parent comes from mDirs.Find.
+NamespaceStore::CreateSelfTrusted(
+    fid_t              parentFid,
+    const NameKey&     key,
+    InodeType          type,
+    fid_t              childFid,
+    TxnId              txnId,
+    CreateResult*      resultPtr,  // optional; filled only on success
+    kfsUid_t           user,
+    kfsGid_t           group,
+    kfsMode_t          mode,
+    int16_t            numReplicas,
+    int64_t            mtime,
+    TxnId              committedTxn)
+{  // No mutex is taken in this body; presumably the caller holds them.
+    if (childFid < 0 || txnId == kNoTxn || txnId <= committedTxn ||
+            ! IsSupportedInodeType(type)) {
+        return -EINVAL;  // txn id must be newer than committedTxn
+    }
+    if (mInodes.Find(childFid)) {
+        return -EEXIST;  // an inode record already uses childFid
+    }
+    DirNode* const dir = mDirs.Find(parentFid);  // no ResolveCreateParentDir
+    if (! dir) {
+        return -ENOENT;
+    }
+    int status = dir->InsertPending(key, childFid, txnId, committedTxn);
+    if (status != 0) {
+        return status;
+    }
+    if (! mInodes.Insert(InodeRecord(childFid, parentFid, type, txnId,
+            user, group, mode, numReplicas, mtime))) {
+        return -EEXIST;  // NOTE(review): pending entry above is not undone
+    }
+    if (type == kInodeTypeDir &&  // directories also get their own DirNode
+            ! mDirs.Insert(childFid, DirNode(
+                mConfig.dirLargeThreshold, mConfig.dirPromoteMaxWallMs))) {
+        return -EEXIST;  // NOTE(review): entry and inode are not undone
+    }
+    if (resultPtr) {
+        resultPtr->fid = childFid;
+        resultPtr->txnId = txnId;
+        resultPtr->parentGeneration = dir->GetGeneration();
+    }
+    return 0;
+}
+
+    int  // Resolve name inside parentFid as of the committed txn.
+NamespaceStore::Lookup(
+    fid_t              parentFid,
+    const std::string& name,
+    LookupResult&      result) const  // reset first, filled on success
+{
+    result = LookupResult();
+    if (parentFid == mRootFid && name == "/") {
+        return GetAttr(mRootFid, result);  // "/" under root is root itself
+    }
+
+    for (int retry = 0; retry < 4; ++retry) {  // at most four attempts
+        fid_t childFid = -1;
+        {  // Phase 1: parent locks only, to learn which child to lock.
+            ScopedSmallMutexGroup parentLocker(
+                &GetDirShardMutex(parentFid),
+                &GetInodeShardMutex(parentFid));
+            const TxnId committedTxn = GetCommittedTxnSnapshot();
+            if (! FindCommittedDir(parentFid, committedTxn)) {
+                return -ENOENT;
+            }
+            const DirNode* const dir = mDirs.Find(parentFid);
+            if (! dir) {
+                return -ENOENT;
+            }
+            const VersionedDirEntry* const entry =
+                dir->LookupCommitted(name, committedTxn);
+            if (! entry) {
+                return -ENOENT;
+            }
+            childFid = entry->childFid;
+        }
+
+        ScopedSmallMutexGroup locker(  // Phase 2: parent and child shards
+            &GetDirShardMutex(parentFid),
+            &GetInodeShardMutex(parentFid),
+            &GetInodeShardMutex(childFid),
+            &GetDirShardMutex(childFid));
+        const TxnId committedTxn = GetCommittedTxnSnapshot();  // re-read
+        if (! FindCommittedDir(parentFid, committedTxn)) {
+            return -ENOENT;
+        }
+        const DirNode* const dir = mDirs.Find(parentFid);
+        if (! dir) {
+            return -ENOENT;
+        }
+        const VersionedDirEntry* const entry =
+            dir->LookupCommitted(name, committedTxn);
+        if (! entry) {
+            return -ENOENT;
+        }
+        if (entry->childFid != childFid) {
+            continue;  // name now maps to another fid: lock again and retry
+        }
+        const InodeRecord* const inode =
+            mInodes.FindCommitted(childFid, committedTxn);
+        if (! inode) {
+            return -ENOENT;
+        }
+        return FillLookupResult(*inode, dir->GetGeneration(), result);
+    }
+    return -EAGAIN;  // the entry changed on every attempt
+}
+
+    int  // Resolve a '/'-separated path one component at a time.
+NamespaceStore::LookupPath(
+    fid_t              rootFid,  // start dir for paths not starting with '/'
+    const std::string& path,
+    LookupResult&      result) const  // ends as the last resolved inode
+{
+    result = LookupResult();
+    if (path.empty()) {
+        return -EINVAL;
+    }
+    fid_t curFid = (! path.empty() && path[0] == '/') ? mRootFid : rootFid;
+    int status = GetAttr(curFid, result);  // attributes of the start dir
+    if (status != 0) {
+        return status;
+    }
+    size_t pos = 0;
+    while (pos < path.size()) {
+        while (pos < path.size() && path[pos] == '/') {
+            ++pos;  // skip one or more separators
+        }
+        if (pos >= path.size()) {
+            break;  // trailing separators: result stays the current inode
+        }
+        const size_t start = pos;
+        while (pos < path.size() && path[pos] != '/') {
+            ++pos;
+        }
+        const std::string name = path.substr(start, pos - start);
+        if (name == ".") {
+            continue;  // "." stays put; ".." gets no special handling here
+        }
+        if (result.type != kInodeTypeDir) {
+            return -ENOTDIR;  // cannot descend through a non-directory
+        }
+        status = Lookup(curFid, name, result);
+        if (status != 0) {
+            return status;
+        }
+        curFid = result.fid;
+    }
+    return 0;
+}
+
+    int  // Fill result from the committed inode fid; 0, -ENOENT or -EAGAIN.
+NamespaceStore::GetAttr(
+    fid_t         fid,
+    LookupResult& result) const  // reset first, filled on success
+{
+    result = LookupResult();
+    for (int retry = 0; retry < 4; ++retry) {  // at most four attempts
+        fid_t parentFid = -1;
+        {  // Phase 1: inode lock only, to learn which parent to lock.
+            std::vector<QCMutex*> inodeLocks;
+            AddInodeShardMutex(inodeLocks, fid);
+            ScopedMutexGroup inodeLocker(inodeLocks);
+            const TxnId committedTxn = GetCommittedTxnSnapshot();
+            const InodeRecord* const inode =
+                mInodes.FindCommitted(fid, committedTxn);
+            if (! inode) {
+                return -ENOENT;
+            }
+            parentFid = inode->parentFid;
+        }
+
+        std::vector<QCMutex*> locks;  // Phase 2: inode, own dir, parent dir
+        AddInodeShardMutex(locks, fid);
+        AddDirShardMutex(locks, fid);
+        AddDirShardMutex(locks, parentFid);
+        ScopedMutexGroup locker(locks);
+        const TxnId committedTxn = GetCommittedTxnSnapshot();  // re-read
+        const InodeRecord* const inode =
+            mInodes.FindCommitted(fid, committedTxn);
+        if (! inode) {
+            return -ENOENT;
+        }
+        if (inode->parentFid != parentFid) {
+            continue;  // parent changed between the phases: retry
+        }
+        uint64_t parentGeneration = 0;  // stays 0 when the parent dir is gone
+        const DirNode* const parentDir = mDirs.Find(parentFid);
+        if (parentDir) {
+            parentGeneration = parentDir->GetGeneration();
+        }
+        return FillLookupResult(*inode, parentGeneration, result);
+    }
+    return -EAGAIN;  // the parent changed on every attempt
+}
+
+    int  // List committed entries of dirFid via DirNode::ReaddirCommitted.
+NamespaceStore::Readdir(
+    fid_t                 dirFid,
+    const ReaddirCookie*  cookiePtr,  // passed through unchanged
+    size_t                maxEntries,
+    ReaddirResult&        result) const
+{
+    std::vector<QCMutex*> locks;
+    AddDirShardMutex(locks, dirFid);
+    AddInodeShardMutex(locks, dirFid);
+    ScopedMutexGroup locker(locks);  // held for the rest of the call
+    const TxnId committedTxn = GetCommittedTxnSnapshot();
+    if (! FindCommittedDir(dirFid, committedTxn)) {
+        return -ENOENT;
+    }
+    const DirNode* const dir = mDirs.Find(dirFid);
+    return dir ? dir->ReaddirCommitted(committedTxn, cookiePtr,
+        maxEntries, result) : -ENOENT;
+}
+
+    int  // Readdir variant positioned by an entry name instead of a cookie.
+NamespaceStore::ReaddirFromName(
+    fid_t              dirFid,
+    const std::string& name,  // empty: a null cookie is passed on
+    size_t             maxEntries,
+    ReaddirResult&     result) const
+{
+    std::vector<QCMutex*> locks;
+    AddDirShardMutex(locks, dirFid);
+    AddInodeShardMutex(locks, dirFid);
+    ScopedMutexGroup locker(locks);  // held for the rest of the call
+    const TxnId committedTxn = GetCommittedTxnSnapshot();
+    if (! FindCommittedDir(dirFid, committedTxn)) {
+        return -ENOENT;
+    }
+    const DirNode* const dir = mDirs.Find(dirFid);
+    if (! dir) {
+        return -ENOENT;
+    }
+    ReaddirCookie cookie;  // built from the directory's current state
+    cookie.generation = dir->GetGeneration();
+    cookie.layout = dir->GetState();
+    cookie.hasLastKeyFlag = ! name.empty();
+    cookie.lastKey = NameKey(name);
+    return dir->ReaddirCommitted(committedTxn,
+        cookie.hasLastKeyFlag ? &cookie : 0, maxEntries, result);
+}
+
+    int  // Forwards to RemoveSelf with the same arguments as RemoveFile.
+NamespaceStore::Remove(
+    fid_t              parentFid,
+    const std::string& name,
+    TxnId*             txnIdPtr)
+{
+    return RemoveSelf(parentFid, name, kInodeTypeFile, false, kNoTxn,
+        txnIdPtr);  // kNoTxn: no caller-supplied txn id
+}
+
+    int  // Forwards to RemoveSelf with kInodeTypeFile, requireEmpty false.
+NamespaceStore::RemoveFile(
+    fid_t              parentFid,
+    const std::string& name,
+    TxnId*             txnIdPtr)
+{
+    return RemoveSelf(parentFid, name, kInodeTypeFile, false, kNoTxn,
+        txnIdPtr);  // kNoTxn: no caller-supplied txn id
+}
+
+    int  // Forwards to RemoveSelf with kInodeTypeDir, requireEmpty true.
+NamespaceStore::Rmdir(
+    fid_t              parentFid,
+    const std::string& name,
+    TxnId*             txnIdPtr)
+{
+    return RemoveSelf(parentFid, name, kInodeTypeDir, true, kNoTxn,
+        txnIdPtr);  // kNoTxn: no caller-supplied txn id
+}
+
+    int  // Split newPath into destination directory fid and final name.
+NamespaceStore::ResolveRenameTarget(
+    fid_t              baseDirFid,  // used when newPath has no '/'
+    const std::string& newPath,
+    fid_t&             dstParentFid,  // out: -1 unless resolved
+    std::string&       dstName) const  // out: empty unless resolved
+{
+    dstParentFid = -1;
+    dstName.clear();
+    if (newPath.empty() || newPath[newPath.size() - 1] == '/') {
+        return -EINVAL;  // empty path or path ending in '/'
+    }
+    const std::string::size_type rslash = newPath.rfind('/');
+    if (rslash == std::string::npos) {
+        dstParentFid = baseDirFid;  // bare name: stays in baseDirFid
+        dstName = newPath;
+        return 0;
+    }
+    LookupResult parent;
+    const int status = LookupPath(baseDirFid,  // max(1, ..) keeps "/" whole
+        newPath.substr(0, std::max(size_t(1), rslash)), parent);
+    if (status != 0) {
+        return status;
+    }
+    if (parent.type != kInodeTypeDir) {
+        return -ENOTDIR;
+    }
+    dstParentFid = parent.fid;
+    dstName = newPath.substr(rslash + 1);  // text after the last '/'
+    return dstName.empty() ? -EINVAL : 0;
+}
+
+    bool  // True when ancestorFid is dirFid or is reached via parent links.
+NamespaceStore::IsDescendant(
+    fid_t ancestorFid,  // fid searched for on the way up
+    fid_t dirFid,       // inode where the upward walk starts
+    TxnId committedTxn) const  // passed to FindCommitted for each step
+{  // Step cap is the inode count, read once, so a parent cycle terminates.
+    fid_t curFid = dirFid;
+    for (size_t guard = 0, limit = mInodes.Size(); guard <= limit; ++guard) {
+        if (curFid == ancestorFid) {
+            return true;
+        }
+        if (curFid == mRootFid) {
+            return false;  // reached the root without meeting ancestorFid
+        }
+        const InodeRecord* const inode =
+            mInodes.FindCommitted(curFid, committedTxn);
+        if (! inode || inode->parentFid == curFid) {
+            return false;  // missing inode, or inode that is its own parent
+        }
+        curFid = inode->parentFid;
+    }
+    return false;  // step cap exhausted
+}
+
+    int  // Forwards to RenameSelf with kNoTxn as the txn id argument.
+NamespaceStore::Rename(
+    fid_t              parentFid,
+    const std::string& oldName,
+    const std::string& newPath,
+    bool               overwriteFlag,
+    TxnId*             txnIdPtr,
+    fid_t*             srcFidPtr)
+{
+    return RenameSelf(parentFid, oldName, newPath, overwriteFlag,
+        kNoTxn, txnIdPtr, srcFidPtr);  // kNoTxn: RenameSelf allocates one
+}
+
+    int  // Move oldName in parentFid to newPath; 0 or a negative status.
+NamespaceStore::RenameSelf(
+    fid_t              parentFid,
+    const std::string& oldName,
+    const std::string& newPath,
+    bool               overwriteFlag,  // false: existing target => -EEXIST
+    TxnId              txnId,     // kNoTxn: allocate an id in this call
+    TxnId*             txnIdPtr,  // optional out: txn id used, else kNoTxn
+    fid_t*             srcFidPtr)  // optional out: source fid once found
+{
+    if (txnIdPtr) {
+        *txnIdPtr = kNoTxn;
+    }
+    if (srcFidPtr) {
+        *srcFidPtr = -1;
+    }
+    if (! IsLegalName(oldName)) {
+        return -EINVAL;
+    }
+
+    fid_t dstParentFid = -1;
+    std::string dstName;
+    int status = ResolveRenameTarget(parentFid, newPath,  // before any lock
+        dstParentFid, dstName);
+    if (status != 0) {
+        return status;
+    }
+    if (! IsLegalName(dstName)) {
+        return -EINVAL;
+    }
+
+    for (int retry = 0; retry < 4; ++retry) {  // at most four attempts
+        fid_t phaseDstFid = -1;
+        InodeType phaseDstType = kInodeTypeFile;
+        bool phaseDstExistsFlag = false;
+        {  // Phase 1: probe the target to learn which dir shard to lock.
+            std::vector<QCMutex*> phaseLocks;
+            AddAllInodeShardMutexes(phaseLocks);
+            AddDirShardMutex(phaseLocks, parentFid);
+            AddDirShardMutex(phaseLocks, dstParentFid);
+            ScopedMutexGroup phaseLocker(phaseLocks);
+            const TxnId committedTxn = GetCommittedTxnSnapshot();
+            if (txnId != kNoTxn && txnId <= committedTxn) {
+                return -EINVAL;  // supplied txn id is not newer than commit
+            }
+            if (! FindCommittedDir(parentFid, committedTxn)) {
+                return -ENOENT;
+            }
+            DirNode* const srcDir = mDirs.Find(parentFid);
+            if (! srcDir) {
+                return -ENOENT;
+            }
+            const VersionedDirEntry* const srcEntry =
+                srcDir->LookupCommitted(oldName, committedTxn);
+            if (! srcEntry) {
+                return -ENOENT;
+            }
+            const InodeRecord* const srcInode =
+                mInodes.FindCommitted(srcEntry->childFid, committedTxn);
+            if (! srcInode) {
+                return -ENOENT;
+            }
+            if (srcFidPtr) {
+                *srcFidPtr = srcInode->fid;
+            }
+            if (parentFid == dstParentFid && oldName == dstName) {
+                return 0;  // rename onto itself: success, nothing changed
+            }
+            if (! FindCommittedDir(dstParentFid, committedTxn)) {
+                return -ENOENT;
+            }
+            DirNode* const dstDir = mDirs.Find(dstParentFid);
+            if (! dstDir) {
+                return -ENOENT;
+            }
+            const VersionedDirEntry* const dstEntry =
+                dstDir->LookupCommitted(dstName, committedTxn);
+            if (dstEntry) {
+                const InodeRecord* const dstInode =
+                    mInodes.FindCommitted(dstEntry->childFid, committedTxn);
+                if (! dstInode) {
+                    return -ENOENT;
+                }
+                phaseDstExistsFlag = true;  // remembered for phase 2 checks
+                phaseDstFid = dstInode->fid;
+                phaseDstType = dstInode->type;
+            }
+        }
+
+        std::vector<QCMutex*> locks;  // Phase 2: same locks + target dir
+        AddAllInodeShardMutexes(locks);
+        AddDirShardMutex(locks, parentFid);
+        AddDirShardMutex(locks, dstParentFid);
+        if (phaseDstExistsFlag && phaseDstType == kInodeTypeDir) {
+            AddDirShardMutex(locks, phaseDstFid);  // for the empty check
+        }
+        ScopedMutexGroup locker(locks);
+        const TxnId committedTxn = GetCommittedTxnSnapshot();  // re-read
+        if (txnId != kNoTxn && txnId <= committedTxn) {
+            return -EINVAL;
+        }
+        if (! FindCommittedDir(parentFid, committedTxn)) {
+            return -ENOENT;
+        }
+        DirNode* const srcDir = mDirs.Find(parentFid);
+        if (! srcDir) {
+            return -ENOENT;
+        }
+        const VersionedDirEntry* const srcEntry =
+            srcDir->LookupCommitted(oldName, committedTxn);
+        if (! srcEntry) {
+            return -ENOENT;
+        }
+        InodeRecord* const srcInode = mInodes.Find(srcEntry->childFid);
+        if (! srcInode || ! srcInode->IsVisible(committedTxn)) {
+            return -ENOENT;
+        }
+        if (srcFidPtr) {
+            *srcFidPtr = srcInode->fid;
+        }
+        if (parentFid == dstParentFid && oldName == dstName) {
+            return 0;  // rename onto itself: success, nothing changed
+        }
+        if (! FindCommittedDir(dstParentFid, committedTxn)) {
+            return -ENOENT;
+        }
+        if (srcInode->type == kInodeTypeDir &&
+                IsDescendant(srcInode->fid, dstParentFid, committedTxn)) {
+            return -EINVAL;  // target dir is the source or lies below it
+        }
+        DirNode* const dstDir = mDirs.Find(dstParentFid);
+        if (! dstDir) {
+            return -ENOENT;
+        }
+        const VersionedDirEntry* const dstEntry =
+            dstDir->LookupCommitted(dstName, committedTxn);
+        InodeRecord* dstInode = 0;
+        if (dstEntry) {
+            dstInode = mInodes.Find(dstEntry->childFid);
+            if (! dstInode || ! dstInode->IsVisible(committedTxn)) {
+                return -ENOENT;
+            }
+            if (! phaseDstExistsFlag || phaseDstFid != dstInode->fid ||
+                    phaseDstType != dstInode->type) {
+                continue;  // target differs from the phase 1 probe: retry
+            }
+            if (! overwriteFlag) {
+                return -EEXIST;
+            }
+            if (srcInode->type != dstInode->type) {  // dir vs non-dir
+                return srcInode->type == kInodeTypeDir ? -ENOTDIR : -EISDIR;
+            }
+            if (dstInode->type == kInodeTypeDir) {
+                const DirNode* const childDir = mDirs.Find(dstInode->fid);
+                if (! childDir) {
+                    return -ENOENT;
+                }
+                if (childDir->GetChildCount() != 0) {
+                    return -ENOTEMPTY;  // only an empty dir is replaced
+                }
+            }
+        } else if (phaseDstExistsFlag) {
+            continue;  // target vanished since the phase 1 probe: retry
+        }
+
+        const TxnId opTxnId = txnId == kNoTxn ? AllocateTxnId() : txnId;
+        if (dstEntry) {  // overwrite: retire the existing target first
+            fid_t deletedFid = -1;
+            status = dstDir->DeletePending(dstName, opTxnId, &deletedFid);
+            if (status != 0) {
+                return status;
+            }
+            if (! dstInode || deletedFid != dstInode->fid ||
+                    ! mInodes.MarkDeleted(deletedFid, opTxnId)) {
+                return -EIO;  // NOTE(review): earlier steps are not undone
+            }
+        }
+        fid_t movedFid = -1;
+        status = srcDir->DeletePending(oldName, opTxnId, &movedFid);
+        if (status != 0) {
+            return status;  // NOTE(review): target delete is not undone
+        }
+        if (movedFid != srcInode->fid) {
+            return -EIO;
+        }
+        status = dstDir->InsertPending(dstName, srcInode->fid, opTxnId,
+            committedTxn, dstEntry != 0);  // last arg: a target was replaced
+        if (status != 0) {
+            return status;
+        }
+        if (parentFid != dstParentFid &&  // cross-directory move only
+                ! mInodes.Move(srcInode->fid, dstParentFid)) {
+            return -EIO;
+        }
+        if (txnId != kNoTxn) {
+            AdvanceSeeds(-1, opTxnId);  // keep mNextTxn >= the supplied id
+        }
+        if (txnIdPtr) {
+            *txnIdPtr = opTxnId;
+        }
+        return 0;
+    }
+    return -EAGAIN;  // the target kept changing on every attempt
+}
+
+    int
+NamespaceStore::RemoveSelf(
+    fid_t              parentFid,
+    const std::string& name,
+    InodeType          type,
+    bool               requireEmptyFlag,
+    TxnId              txnId,
+    TxnId*             txnIdPtr)
+{
+    // Marks the committed entry `name` of directory parentFid, and the inode
+    // it refers to, as deleted by transaction txnId (a freshly allocated id
+    // when txnId is kNoTxn). Returns 0 and stores the transaction id through
+    // txnIdPtr on success; on failure returns a negative errno and leaves
+    // *txnIdPtr set to kNoTxn.
+    if (txnIdPtr) {
+        *txnIdPtr = kNoTxn;
+    }
+    if (! IsLegalName(name)) {
+        return -EINVAL;
+    }
+
+    // Pass 1: with only the parent's shards locked, learn which child fid
+    // the name currently resolves to.
+    fid_t childFid = -1;
+    {
+        std::vector<QCMutex*> probeLocks;
+        AddDirShardMutex(probeLocks, parentFid);
+        AddInodeShardMutex(probeLocks, parentFid);
+        ScopedMutexGroup probeGuard(probeLocks);
+        const TxnId snapshot = GetCommittedTxnSnapshot();
+        if (txnId != kNoTxn && txnId <= snapshot) {
+            return -EINVAL;
+        }
+        DirNode* const parentDir =
+            FindCommittedDir(parentFid, snapshot) ? mDirs.Find(parentFid) : 0;
+        const VersionedDirEntry* const probe =
+            parentDir ? parentDir->LookupCommitted(name, snapshot) : 0;
+        if (! probe) {
+            return -ENOENT;
+        }
+        childFid = probe->childFid;
+    }
+
+    // Pass 2: lock the child's shards as well, then re-check everything
+    // because the name could have been rebound while no lock was held; a
+    // different child fid is reported as -ENOENT.
+    std::vector<QCMutex*> opLocks;
+    AddDirShardMutex(opLocks, parentFid);
+    AddInodeShardMutex(opLocks, parentFid);
+    AddInodeShardMutex(opLocks, childFid);
+    if (requireEmptyFlag) {
+        AddDirShardMutex(opLocks, childFid);
+    }
+    ScopedMutexGroup opGuard(opLocks);
+    const TxnId snapshot = GetCommittedTxnSnapshot();
+    if (txnId != kNoTxn && txnId <= snapshot) {
+        return -EINVAL;
+    }
+    DirNode* const parentDir =
+        FindCommittedDir(parentFid, snapshot) ? mDirs.Find(parentFid) : 0;
+    const VersionedDirEntry* const entry =
+        parentDir ? parentDir->LookupCommitted(name, snapshot) : 0;
+    if (! entry || entry->childFid != childFid) {
+        return -ENOENT;
+    }
+    const InodeRecord* const victim =
+        mInodes.FindCommitted(childFid, snapshot);
+    if (! victim) {
+        return -ENOENT;
+    }
+    if (victim->type != type) {
+        return type == kInodeTypeDir ? -ENOTDIR : -EISDIR;
+    }
+    if (requireEmptyFlag) {
+        const DirNode* const childDir = mDirs.Find(victim->fid);
+        if (! childDir) {
+            return -ENOENT;
+        }
+        if (childDir->GetChildCount() != 0) {
+            return -ENOTEMPTY;
+        }
+    }
+    const TxnId opTxnId = txnId != kNoTxn ? txnId : AllocateTxnId();
+    fid_t removedFid = -1;
+    const int rc = parentDir->DeletePending(name, opTxnId, &removedFid);
+    if (rc != 0) {
+        return rc;
+    }
+    if (removedFid != childFid || ! mInodes.MarkDeleted(childFid, opTxnId)) {
+        return -EIO;
+    }
+    AdvanceSeeds(-1, opTxnId);
+    if (txnIdPtr) {
+        *txnIdPtr = opTxnId;
+    }
+    return 0;
+}
+
+    int
+NamespaceStore::ApplyCreate(
+    fid_t              parentFid,
+    const std::string& name,
+    InodeType          type,
+    fid_t              childFid,
+    TxnId              txnId,
+    kfsUid_t           user,
+    kfsGid_t           group,
+    kfsMode_t          mode,
+    int16_t            numReplicas,
+    int64_t            mtime,
+    bool               commitFlag,       // commit txnId after applying
+    bool               advanceSeedsFlag) // also call AdvanceSeeds
+{
+    // Applies a create whose fid and transaction id the caller already chose.
+    if (! (parentFid >= 0 && childFid >= 0 && txnId != kNoTxn)) {
+        return -EINVAL;
+    }
+    int result = 0;
+    {
+        ScopedSmallMutexGroup shardGuard(
+            &GetDirShardMutex(parentFid),
+            &GetInodeShardMutex(parentFid),
+            &GetInodeShardMutex(childFid),
+            type == kInodeTypeDir ? &GetDirShardMutex(childFid) : 0);
+        result = CreateSelf(parentFid, NameKey(name), type, childFid, txnId, 0,
+            user, group, mode, numReplicas, mtime,
+            commitFlag ? GetCommittedTxnSnapshot() : txnId - 1);
+    }
+    if (advanceSeedsFlag) { // done whether or not CreateSelf succeeded
+        AdvanceSeeds(childFid, txnId);
+    }
+    if (commitFlag) { // likewise independent of result
+        CommitThrough(txnId);
+    }
+    return result;
+}
+
+    int
+NamespaceStore::ApplyCreateTrusted(
+    fid_t              parentFid,
+    const std::string& name,
+    InodeType          type,
+    fid_t              childFid,
+    TxnId              txnId,
+    kfsUid_t           user,
+    kfsGid_t           group,
+    kfsMode_t          mode,
+    int16_t            numReplicas,
+    int64_t            mtime,
+    bool               commitFlag,
+    bool               advanceSeedsFlag)
+{
+    // Same steps as ApplyCreate, except the entry is inserted through
+    // CreateSelfTrusted instead of CreateSelf.
+    if (! (parentFid >= 0 && childFid >= 0 && txnId != kNoTxn)) {
+        return -EINVAL;
+    }
+    int result = 0;
+    {
+        ScopedSmallMutexGroup shardGuard(
+            &GetDirShardMutex(parentFid),
+            &GetInodeShardMutex(parentFid),
+            &GetInodeShardMutex(childFid),
+            type == kInodeTypeDir ? &GetDirShardMutex(childFid) : 0);
+        result = CreateSelfTrusted(parentFid, NameKey(name), type,
+            childFid, txnId, 0, user, group, mode, numReplicas, mtime,
+            commitFlag ? GetCommittedTxnSnapshot() : txnId - 1);
+    }
+    if (advanceSeedsFlag) {
+        AdvanceSeeds(childFid, txnId);
+    }
+    if (commitFlag) {
+        CommitThrough(txnId);
+    }
+    return result;
+}
+
+
+    int
+NamespaceStore::ApplyEditLog(
+    const EditLogRecord& record,
+    bool                 commitFlag)
+{
+    // Replays a single record. commitFlag is only forwarded to ApplyCreate;
+    // successful remove and rename records always call CommitThrough.
+    const int checkStatus = ValidateEditLogRecord(record);
+    if (checkStatus != 0) {
+        return checkStatus;
+    }
+    if (GetCommittedTxnSnapshot() >= record.txnId) {
+        return -EINVAL; // record is not newer than the committed state
+    }
+    if (record.type == EditLogRecord::kCreate) {
+        return ApplyCreate(record.parentFid, record.name, record.inodeType,
+            record.fid, record.txnId, record.user, record.group,
+            record.mode, record.numReplicas, record.mtime, commitFlag, true);
+    }
+    if (record.type == EditLogRecord::kRemove) {
+        TxnId appliedTxn = kNoTxn;
+        int rc = RemoveSelf(record.parentFid, record.name, record.inodeType,
+            record.inodeType == kInodeTypeDir, record.txnId, &appliedTxn);
+        if (rc == 0 && appliedTxn != record.txnId) {
+            rc = -EINVAL;
+        } else if (rc == 0) {
+            CommitThrough(record.txnId);
+        }
+        return rc;
+    }
+    if (record.type == EditLogRecord::kRename) {
+        LookupResult found;
+        int rc = Lookup(record.parentFid, record.name, found);
+        if (rc == 0 && found.fid != record.fid) {
+            rc = -EINVAL;
+        }
+        if (rc != 0) {
+            return rc;
+        }
+        TxnId appliedTxn = kNoTxn;
+        fid_t movedFid = -1;
+        rc = RenameSelf(record.parentFid, record.name, record.newPath,
+            record.overwriteFlag, record.txnId, &appliedTxn, &movedFid);
+        if (rc == 0 &&
+                (appliedTxn != record.txnId || movedFid != record.fid)) {
+            rc = -EINVAL;
+        } else if (rc == 0) {
+            CommitThrough(record.txnId);
+        }
+        return rc;
+    }
+    return -EINVAL;
+}
+
+    int
+NamespaceStore::ApplyEditLog(
+    std::istream& is)
+{
+    // Reads one record per non-empty line and applies it with the default
+    // commitFlag, stopping at the first parse or apply error.
+    for (std::string text; std::getline(is, text); ) {
+        if (text.empty()) {
+            continue;
+        }
+        EditLogRecord parsed;
+        int rc = ReadEditLog(text, parsed);
+        if (rc == 0) {
+            rc = ApplyEditLog(parsed);
+        }
+        if (rc != 0) {
+            return rc;
+        }
+    }
+    return is.bad() ? -EIO : 0;
+}
+
+    int
+NamespaceStore::SaveCheckpoint(
+    std::ostream& os) const
+{
+    // Writes a text checkpoint: a "namespacev2_checkpoint 1" header, one
+    // "state" line, then "inode", "dirgen" and "dentry" lines, ending
+    // with "end". The mutex group built by AddAllDirShardMutexes and
+    // AddAllInodeShardMutexes is held until the function returns.
+    if (! os.good()) {
+        return -EIO;
+    }
+    std::vector<QCMutex*> shardMutexes;
+    AddAllDirShardMutexes(shardMutexes);
+    AddAllInodeShardMutexes(shardMutexes);
+    ScopedMutexGroup shardGuard(shardMutexes);
+    fid_t nextFid = -1;
+    TxnId nextTxn = kNoTxn;
+    TxnId committedTxn = kNoTxn;
+    {
+        ScopedMutex txnGuard(GetTxnMutex());
+        nextFid = mNextFid;
+        nextTxn = mNextTxn;
+        committedTxn = mCommittedTxn.load(std::memory_order_relaxed);
+    }
+    os << "namespacev2_checkpoint 1\n" <<
+        "state " << mRootFid << " " << nextFid << " " <<
+            nextTxn << " " << committedTxn << " " <<
+            mConfig.dirLargeThreshold << "\n";
+
+    // Inodes committed as of committedTxn.
+    std::vector<InodeRecord> inodes;
+    mInodes.GetCommitted(committedTxn, inodes);
+    for (const InodeRecord& inode : inodes) {
+        os << "inode " << inode.fid << " " << inode.parentFid << " " <<
+            InodeTypeToInt(inode.type) << " " << inode.generation << " " <<
+            inode.user << " " << inode.group << " " << inode.mode << " " <<
+            inode.numReplicas << " " << inode.mtime << " " <<
+            inode.ctime << " " << inode.atime << "\n";
+    }
+
+    // Directory generations, skipping directories whose inode is not
+    // committed.
+    std::vector<std::pair<fid_t, uint64_t> > dirGenerations;
+    mDirs.GetDirGenerations(dirGenerations);
+    for (const std::pair<fid_t, uint64_t>& gen : dirGenerations) {
+        if (mInodes.FindCommitted(gen.first, committedTxn)) {
+            os << "dirgen " << gen.first << " " << gen.second << "\n";
+        }
+    }
+
+    // Directory entries; an entry is written only when both its parent
+    // and its child inode are committed.
+    std::vector<CheckpointDirEntry> entries;
+    mDirs.GetCommittedEntries(committedTxn, entries);
+    for (const CheckpointDirEntry& entry : entries) {
+        if (mInodes.FindCommitted(entry.parentFid, committedTxn) &&
+                mInodes.FindCommitted(entry.childFid, committedTxn)) {
+            os << "dentry " << entry.parentFid << " " << entry.childFid <<
+                " " << EncodeName(entry.key.name) << "\n";
+        }
+    }
+    os << "end\n";
+    return os.good() ? 0 : -EIO;
+}
+
+    int
+NamespaceStore::SaveCheckpointDiskEntry(
+    std::ostream& os) const
+{
+    // Writes the same records as SaveCheckpoint in a different layout: one
+    // '/'-separated line per record, tagged "nv2/state", "nv2/inode",
+    // "nv2/dirgen" or "nv2/dentry", followed by a final "nv2/end" line.
+    // The mutex group built by AddAllDirShardMutexes and
+    // AddAllInodeShardMutexes is held until the function returns.
+    if (! os.good()) {
+        return -EIO;
+    }
+    std::vector<QCMutex*> shardMutexes;
+    AddAllDirShardMutexes(shardMutexes);
+    AddAllInodeShardMutexes(shardMutexes);
+    ScopedMutexGroup shardGuard(shardMutexes);
+    fid_t nextFid = -1;
+    TxnId nextTxn = kNoTxn;
+    TxnId committedTxn = kNoTxn;
+    {
+        ScopedMutex txnGuard(GetTxnMutex());
+        nextFid = mNextFid;
+        nextTxn = mNextTxn;
+        committedTxn = mCommittedTxn.load(std::memory_order_relaxed);
+    }
+    os << "nv2/state/" << mRootFid << "/" << nextFid << "/" <<
+        nextTxn << "/" << committedTxn << "/" <<
+        mConfig.dirLargeThreshold << "\n";
+
+    // Inodes committed as of committedTxn.
+    std::vector<InodeRecord> inodes;
+    mInodes.GetCommitted(committedTxn, inodes);
+    for (const InodeRecord& inode : inodes) {
+        os << "nv2/inode/" << inode.fid << "/" << inode.parentFid <<
+            "/" << InodeTypeToInt(inode.type) << "/" <<
+            inode.generation << "/" << inode.user << "/" <<
+            inode.group << "/" << inode.mode << "/" <<
+            inode.numReplicas << "/" << inode.mtime << "/" <<
+            inode.ctime << "/" << inode.atime << "\n";
+    }
+
+    // Directory generations, skipping directories whose inode is not
+    // committed.
+    std::vector<std::pair<fid_t, uint64_t> > dirGenerations;
+    mDirs.GetDirGenerations(dirGenerations);
+    for (const std::pair<fid_t, uint64_t>& gen : dirGenerations) {
+        if (mInodes.FindCommitted(gen.first, committedTxn)) {
+            os << "nv2/dirgen/" << gen.first << "/" << gen.second << "\n";
+        }
+    }
+
+    // Directory entries; an entry is written only when both its parent
+    // and its child inode are committed.
+    std::vector<CheckpointDirEntry> entries;
+    mDirs.GetCommittedEntries(committedTxn, entries);
+    for (const CheckpointDirEntry& entry : entries) {
+        if (mInodes.FindCommitted(entry.parentFid, committedTxn) &&
+                mInodes.FindCommitted(entry.childFid, committedTxn)) {
+            os << "nv2/dentry/" << entry.parentFid << "/" <<
+                entry.childFid << "/" << EncodeName(entry.key.name) << "\n";
+        }
+    }
+    os << "nv2/end\n";
+    return os.good() ? 0 : -EIO;
+}
+
+    int
+NamespaceStore::LoadCheckpoint(
+    std::istream& is)
+{   // Reads a SaveCheckpoint text image; adopts it only if all of it is valid.
+    std::string magic;
+    int version = 0;
+    if (! (is >> magic >> version) || magic != "namespacev2_checkpoint" ||
+            version != 1) {
+        return -EINVAL; // missing or unexpected header line
+    }
+    std::string stateTag;
+    fid_t rootFid = -1;
+    fid_t nextFid = -1;
+    TxnId nextTxn = 0;
+    TxnId committedTxn = 0;
+    int largeThreshold = 0;
+    if (! (is >> stateTag >> rootFid >> nextFid >> nextTxn >>
+            committedTxn >> largeThreshold) || stateTag != "state" ||
+            rootFid < 0 || nextFid <= rootFid || largeThreshold <= 0 ||
+            committedTxn > nextTxn) {
+        return -EINVAL; // malformed or inconsistent "state" line
+    }
+    // The image is assembled in tmp; this store is untouched until the final copy.
+    NamespaceStore tmp(mConfig, rootFid);
+    tmp.mConfig.dirLargeThreshold = largeThreshold;
+    tmp.mRootFid      = rootFid;
+    tmp.mNextFid      = nextFid;
+    tmp.mNextTxn      = nextTxn;
+    tmp.mCommittedTxn.store(committedTxn, std::memory_order_relaxed);
+    tmp.mInodes       = InodeTable(); // start from empty tables
+    tmp.mDirs         = DirTable();
+    // "dirgen" and "dentry" lines are buffered here and applied after the read loop.
+    std::vector<CheckpointDirEntry> dentries;
+    std::vector<std::pair<fid_t, uint64_t> > dirGenerations;
+    std::string tag;
+    bool endFlag = false;
+    while (is >> tag) {
+        if (tag == "end") {
+            endFlag = true;
+            break;
+        }
+        if (tag == "inode") {
+            fid_t fid = -1;
+            fid_t parentFid = -1;
+            int typeValue = -1;
+            uint64_t generation = 0;
+            kfsUid_t user = kKfsUserRoot;
+            kfsGid_t group = kKfsGroupRoot;
+            kfsMode_t mode = 0;
+            int16_t numReplicas = 1;
+            int64_t mtime = 0;
+            int64_t ctime = 0;
+            int64_t atime = 0;
+            InodeType type = kInodeTypeFile;
+            if (! (is >> fid >> parentFid >> typeValue >> generation >>
+                    user >> group >> mode >> numReplicas >> mtime >>
+                    ctime >> atime) || fid < 0 || parentFid < 0 ||
+                    ! IntToInodeType(typeValue, type)) {
+                return -EINVAL;
+            }
+            InodeRecord inode(fid, parentFid, type, kNoTxn,
+                user, group, mode, numReplicas, mtime);
+            inode.generation  = generation;
+            inode.pendingFlag = false; // loaded inodes are not pending
+            inode.ctime       = ctime;
+            inode.atime       = atime;
+            if (! tmp.mInodes.Insert(inode)) {
+                return -EINVAL;
+            }
+            if (type == kInodeTypeDir && ! tmp.mDirs.Insert( // dirs also get a DirNode
+                    fid, DirNode(tmp.mConfig.dirLargeThreshold,
+                        tmp.mConfig.dirPromoteMaxWallMs))) {
+                return -EINVAL;
+            }
+        } else if (tag == "dirgen") {
+            fid_t dirFid = -1;
+            uint64_t generation = 0;
+            if (! (is >> dirFid >> generation) || dirFid < 0) {
+                return -EINVAL;
+            }
+            dirGenerations.push_back(std::make_pair(dirFid, generation));
+        } else if (tag == "dentry") {
+            fid_t parentFid = -1;
+            fid_t childFid = -1;
+            std::string encodedName;
+            std::string name;
+            if (! (is >> parentFid >> childFid >> encodedName) ||
+                    ! DecodeName(encodedName, name)) {
+                return -EINVAL;
+            }
+            dentries.push_back(CheckpointDirEntry(
+                parentFid, NameKey(name), childFid));
+        } else {
+            return -EINVAL; // unrecognised record tag
+        }
+    }
+    if (! endFlag) {
+        return -EINVAL; // input ended without the "end" marker
+    }
+    const InodeRecord* const root =
+        tmp.mInodes.FindCommitted(rootFid, committedTxn);
+    if (! root || root->type != kInodeTypeDir || ! tmp.mDirs.Find(rootFid)) {
+        return -EINVAL; // the root must be a loaded directory with a DirNode
+    }
+    for (std::vector<CheckpointDirEntry>::const_iterator it = dentries.begin();
+            it != dentries.end();
+            ++it) {
+        const InodeRecord* const parent =
+            tmp.mInodes.FindCommitted(it->parentFid, committedTxn);
+        const InodeRecord* const child =
+            tmp.mInodes.FindCommitted(it->childFid, committedTxn);
+        DirNode* const dir = tmp.mDirs.Find(it->parentFid);
+        if (! parent || parent->type != kInodeTypeDir || ! child || ! dir) {
+            return -EINVAL; // entry refers to an inode or directory not in the image
+        }
+        const int status = dir->InsertCommitted(it->key.name, it->childFid);
+        if (status != 0) {
+            return status;
+        }
+    }
+    for (std::vector<std::pair<fid_t, uint64_t> >::const_iterator it =
+                dirGenerations.begin();
+            it != dirGenerations.end();
+            ++it) {
+        DirNode* const dir = tmp.mDirs.Find(it->first);
+        if (! dir) {
+            return -EINVAL;
+        }
+        dir->SetGeneration(it->second); // applied after the entries were inserted
+    }
+    {   // Publish: copy tmp into this store under the shard mutexes and the txn mutex.
+        std::vector<QCMutex*> locks;
+        AddAllDirShardMutexes(locks);
+        AddAllInodeShardMutexes(locks);
+        ScopedMutexGroup locker(locks);
+        ScopedMutex txnLocker(GetTxnMutex());
+        mConfig = tmp.mConfig;
+        mRootFid = tmp.mRootFid;
+        mNextFid = tmp.mNextFid;
+        mNextTxn = tmp.mNextTxn;
+        mCommittedTxn.store(
+            tmp.mCommittedTxn.load(std::memory_order_relaxed),
+            std::memory_order_release);
+        mPendingCommittedTxns = tmp.mPendingCommittedTxns;
+        mInodes = tmp.mInodes;
+        mDirs = tmp.mDirs;
+    }
+    return 0;
+}
+
+    void
+NamespaceStore::CommitThrough(
+    TxnId committedTxn)
+{
+    // Records committedTxn as committed, then moves the committed
+    // watermark forward across every consecutive id already recorded.
+    // Ids recorded beyond a gap wait in mPendingCommittedTxns until the
+    // missing ids arrive. Calls for ids at or below the watermark are
+    // ignored.
+    ScopedMutex txnGuard(GetTxnMutex());
+    TxnId watermark = mCommittedTxn.load(std::memory_order_relaxed);
+    if (watermark >= committedTxn) {
+        return;
+    }
+    mPendingCommittedTxns.insert(committedTxn);
+    // set::erase(key) returns how many elements it removed (0 or 1).
+    while (mPendingCommittedTxns.erase(watermark + 1) != 0) {
+        ++watermark;
+    }
+    mCommittedTxn.store(watermark, std::memory_order_release);
+}
+
+    void
+NamespaceStore::CommitThroughRange(
+    TxnId firstTxn,
+    TxnId lastTxn)
+{
+    // Commits the inclusive range [firstTxn, lastTxn]. A range touching the
+    // committed watermark moves it straight to lastTxn; otherwise each id
+    // is parked in mPendingCommittedTxns until the gap below it closes.
+    if (lastTxn < firstTxn) {
+        return;
+    }
+    ScopedMutex locker(GetTxnMutex());
+    TxnId current = mCommittedTxn.load(std::memory_order_relaxed);
+    if (lastTxn <= current) {
+        return;
+    }
+    if (firstTxn <= current + 1) {
+        current = lastTxn;
+        // Drop ids the jump just covered; nothing else would erase them.
+        mPendingCommittedTxns.erase(mPendingCommittedTxns.begin(),
+            mPendingCommittedTxns.upper_bound(current));
+    } else {
+        for (TxnId txnId = firstTxn; txnId <= lastTxn; ++txnId) {
+            mPendingCommittedTxns.insert(txnId);
+            if (txnId == lastTxn) {
+                break; // avoids wrapping ++txnId when lastTxn is the maximum
+            }
+        }
+    }
+    while (mPendingCommittedTxns.erase(current + 1) != 0) {
+        ++current;
+    }
+    mCommittedTxn.store(current, std::memory_order_release);
+}
+
+    const InodeRecord*
+NamespaceStore::FindCommittedDir(
+    fid_t dirFid,
+    TxnId committedTxn) const
+{
+    // Committed inode for dirFid, or null when absent or not a directory.
+    const InodeRecord* const rec = mInodes.FindCommitted(dirFid, committedTxn);
+    return (rec && rec->type == kInodeTypeDir) ? rec : 0;
+}
+
+    int
+NamespaceStore::ResolveCreateParentDir(
+    fid_t   parentFid,
+    TxnId   committedTxn,
+    DirNode*& dirPtr)
+{
+    // Looks up the DirNode of parentFid for a create. dirPtr is set to
+    // the node and 0 is returned when parentFid is a committed directory
+    // that has a DirNode; in every other case dirPtr is null and the
+    // result is -ENOENT.
+    dirPtr = 0;
+    if (FindCommittedDir(parentFid, committedTxn)) {
+        dirPtr = mDirs.Find(parentFid);
+    }
+    return dirPtr ? 0 : -ENOENT;
+}
+
+    int
+NamespaceStore::CheckCreateParentName(
+    fid_t              parentFid,
+    const NameKey&     key,
+    TxnId              committedTxn)
+{
+    // Pre-check for a create: fails with the ResolveCreateParentDir status
+    // when the parent cannot be resolved, and with -EEXIST when the
+    // directory already has a visible or pending entry for key.
+    DirNode* parentDir = 0;
+    const int rc = ResolveCreateParentDir(parentFid, committedTxn, parentDir);
+    if (rc != 0) {
+        return rc;
+    }
+    return parentDir->HasVisibleOrPendingName(key, committedTxn) ? -EEXIST : 0;
+}
+
+    int
+NamespaceStore::FillLookupResult(
+    const InodeRecord& inode,
+    uint64_t           parentGeneration,
+    LookupResult&      result) const
+{   // Copies inode's attributes and parentGeneration into result; always returns 0.
+    result.fid              = inode.fid;
+    result.type             = inode.type;
+    result.parentGeneration = parentGeneration;
+    result.user             = inode.user;
+    result.group            = inode.group;
+    result.mode             = inode.mode;
+    result.numReplicas      = inode.numReplicas;
+    result.mtime            = inode.mtime;
+    result.ctime            = inode.ctime;
+    result.atime            = inode.atime;
+    if (inode.type == kInodeTypeDir) { // the counts are written for directories only
+        const DirNode* const dir = mDirs.Find(inode.fid);
+        result.fileCount = dir ? (int64_t)dir->GetChildCount() : 0; // all children; 0 without a DirNode
+        result.dirCount = 0; // no separate sub-directory count is computed here
+    }
+    return 0;
+}
+
+ResourceLockKey::ResourceLockKey(
+    Class    inResourceClass,
+    uint64_t inMajor,
+    uint64_t inMinor)
+    : resourceClass(inResourceClass),
+      major(inMajor), // NOTE(review): older glibc headers define major()/minor()
+      minor(inMinor)  // as function-like macros — confirm this builds there
+    {} // stores the three key components; no other work is done
+
+    bool
+ResourceLockKey::operator<(
+    const ResourceLockKey& other) const
+{
+    // Orders keys by resourceClass first; keys of the same class are
+    // ordered by major, and keys that also share major are ordered by
+    // minor.
+    if (resourceClass != other.resourceClass) {
+        return resourceClass < other.resourceClass;
+    }
+    return major != other.major ? major < other.major : minor < other.minor;
+}
+
+} // namespace NamespaceV2
+} // namespace KFS
diff --git a/src/cc/meta/NamespaceV2.h b/src/cc/meta/NamespaceV2.h
new file mode 100644
index 000000000..5e79c1029
--- /dev/null
+++ b/src/cc/meta/NamespaceV2.h
@@ -0,0 +1,457 @@
+//---------------------------------------------------------- -*- Mode: C++ -*-
+// $Id$
+//
+// Memory-native namespace scaffolding for RFC-0001.
+//
+// Copyright 2026 Quantcast Corporation. All rights reserved.
+//
+// This file is part of Kosmos File System (KFS).
+//
+// Licensed under the Apache License, Version 2.0.
+//
+//----------------------------------------------------------------------------
+
+#ifndef META_NAMESPACE_V2_H
+#define META_NAMESPACE_V2_H
+
+#include "common/kfsdecls.h"
+
+#include <stdint.h>
+
+#include <atomic>
+#include <cstddef>
+#include <iosfwd>
+
+#include <map>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include <utility>
+
+class QCMutex;
+
+namespace KFS
+{
+
+class Properties;
+
+namespace NamespaceV2
+{
+
+typedef uint64_t TxnId; // transaction id; ids are compared numerically
+
+enum InodeType // kind of object an InodeRecord describes
+{
+    kInodeTypeFile,
+    kInodeTypeDir,    // only this type gets a DirNode when a checkpoint is loaded
+    kInodeTypeSymlink
+};
+
+struct Config // settings a NamespaceStore is constructed with
+{
+    bool enabledFlag;         // presumably turns namespace v2 on — confirm
+    bool rpcEnabledFlag;      // presumably gates the RPC path — confirm
+    int  dirLargeThreshold;   // passed to DirNode as largeThreshold; checkpoints require > 0
+    int  dirPromoteMaxWallMs; // passed to DirNode as promoteMaxWallMs
+    int  dirShardCount;       // presumably the number of directory shards — confirm
+
+    Config();
+    static Config FromProperties(const Properties& props);
+};
+
+void SetParameters(const Properties& props); // presumably updates the Config returned below — confirm
+const Config& GetConfig(); // supplies the default Config of the NamespaceStore constructor
+
+struct NameKey // directory entry name paired with a hash value
+{
+    uint64_t    hash; // presumably derived from name — confirm in the .cc
+    std::string name; // the entry name itself; this is what checkpoints encode
+
+    NameKey();
+    explicit NameKey(const std::string& name);
+    NameKey(uint64_t hash, const std::string& name);
+
+    bool operator<(const NameKey& other) const;
+    bool operator==(const NameKey& other) const;
+};
+
+struct NameKeyHash // hasher for the NameKey-keyed unordered_maps used by DirNode
+{
+    size_t operator()(const NameKey& key) const;
+};
+
+struct VersionedDirEntry // directory entry value: child fid plus create/delete txn ids
+{
+    fid_t  childFid;    // fid of the inode the name refers to
+    TxnId  createTxn;   // presumably the txn that inserted the entry — confirm
+    TxnId  deleteTxn;   // presumably the txn that deleted the entry — confirm
+    bool   pendingFlag; // presumably set until the owning txn commits — confirm
+
+    VersionedDirEntry();
+    VersionedDirEntry(fid_t childFid, TxnId createTxn);
+
+    bool IsVisible(TxnId committedTxn) const;
+};
+
+struct InodeRecord // per-inode attributes plus create/delete transaction ids
+{
+    fid_t     fid;
+    fid_t     parentFid;
+    InodeType type;
+    TxnId     createTxn;   // kNoTxn for inodes loaded from a checkpoint
+    TxnId     deleteTxn;
+    bool      pendingFlag; // cleared for inodes loaded from a checkpoint
+    uint64_t  generation;  // saved in and restored from checkpoints
+    kfsUid_t  user;
+    kfsGid_t  group;
+    kfsMode_t mode;
+    int16_t   numReplicas;
+    int64_t   mtime;
+    int64_t   ctime;       // not a constructor argument; checkpoint load sets it afterwards
+    int64_t   atime;       // not a constructor argument; checkpoint load sets it afterwards
+
+    InodeRecord();
+    InodeRecord(fid_t fid, fid_t parentFid, InodeType type, TxnId createTxn,
+        kfsUid_t user = kKfsUserRoot, kfsGid_t group = kKfsGroupRoot,
+        kfsMode_t mode = 0, int16_t numReplicas = 1, int64_t mtime = 0);
+
+    bool IsVisible(TxnId committedTxn) const;
+};
+
+class InodeTable // InodeRecord storage keyed by fid
+{
+public:
+    InodeTable();
+    bool Insert(const InodeRecord& record); // checkpoint load treats false as invalid input
+    InodeRecord* Find(fid_t fid);
+    const InodeRecord* Find(fid_t fid) const;
+    const InodeRecord* FindCommitted(fid_t fid, TxnId committedTxn) const; // callers treat null as "not found"
+    bool MarkDeleted(fid_t fid, TxnId deleteTxn); // callers map false to -EIO
+    bool Move(fid_t fid, fid_t parentFid); // callers map false to -EIO
+    void GetCommitted(TxnId committedTxn,
+        std::vector<InodeRecord>& records) const; // the records checkpoints write out
+    void CommitThrough(TxnId committedTxn);
+    size_t Size() const;
+private:
+    typedef std::unordered_map<fid_t, InodeRecord> Table;
+    typedef std::vector<Table> Tables;
+    Tables mTables; // several fid-keyed maps; presumably one per shard — confirm
+};
+
+enum DirState // layout state reported by DirNode::GetState
+{
+    kDirStateSmall,
+    kDirStatePromoting, // presumably the small-to-large transition — confirm
+    kDirStateLarge
+};
+
+struct ReaddirCookie // position token accepted by Readdir / ReaddirCommitted
+{
+    uint64_t generation;
+    DirState layout;
+    bool     hasLastKeyFlag; // presumably tells whether lastKey is meaningful — confirm
+    NameKey  lastKey;
+
+    ReaddirCookie();
+};
+
+struct ReaddirResult // output of Readdir / ReaddirCommitted
+{
+    struct Entry // one listed name with the fid it refers to
+    {
+        NameKey key;
+        fid_t   childFid;
+
+        Entry(const NameKey& key, fid_t childFid);
+    };
+
+    std::vector<Entry> entries;
+    bool               moreEntriesFlag; // presumably set when entries remain — confirm
+    ReaddirCookie      nextCookie;      // presumably the cookie for the next call — confirm
+
+    ReaddirResult();
+};
+
+struct CheckpointDirEntry // one parent/name/child link as written to and read from checkpoints
+{
+    fid_t   parentFid; // directory holding the entry
+    NameKey key;       // key.name is what the checkpoint stores (encoded)
+    fid_t   childFid;  // inode the entry refers to
+
+    CheckpointDirEntry(fid_t parentFid, const NameKey& key,
+        fid_t childFid);
+};
+
+class DirNode // entries of a single directory, keyed by NameKey
+{
+public:
+    explicit DirNode(
+        int largeThreshold = Config().dirLargeThreshold,
+        int promoteMaxWallMs = Config().dirPromoteMaxWallMs);
+
+    DirState GetState() const;
+    uint64_t GetGeneration() const;
+    void SetGeneration(uint64_t generation); // used when a checkpoint restores "dirgen" values
+    size_t GetChildCount() const; // callers compare against 0 to test for an empty directory
+    bool IsLarge() const;
+
+    bool HasVisibleOrPendingName(const NameKey& key,
+        TxnId committedTxn) const;
+    bool HasVisibleOrPendingName(const std::string& name,
+        TxnId committedTxn) const;
+    int InsertPending(const NameKey& key, fid_t childFid, TxnId txnId,
+        TxnId committedTxn = 0, bool replaceDeletedFlag = false); // callers treat non-zero as an error status
+    int InsertPending(const std::string& name, fid_t childFid, TxnId txnId,
+        TxnId committedTxn = 0, bool replaceDeletedFlag = false);
+    int InsertCommitted(const std::string& name, fid_t childFid); // used by checkpoint load; non-zero aborts it
+    void GetCommittedEntries(TxnId committedTxn,
+        std::vector<ReaddirResult::Entry>& entries) const;
+    int DeletePending(const std::string& name, TxnId txnId,
+        fid_t* childFidPtr = 0); // callers read the deleted child's fid back through childFidPtr
+    const VersionedDirEntry* LookupCommitted(
+        const std::string& name, TxnId committedTxn) const; // callers treat null as "no such entry"
+    int ReaddirCommitted(TxnId committedTxn, const ReaddirCookie* cookiePtr,
+        size_t maxEntries, ReaddirResult& result) const;
+    void CommitThrough(TxnId committedTxn);
+
+private:
+    typedef std::unordered_map<NameKey, VersionedDirEntry, NameKeyHash>
+        SmallEntries;
+    typedef std::unordered_map<NameKey, VersionedDirEntry, NameKeyHash>
+        LargeEntries; // currently the same container type as SmallEntries
+
+    DirState GetCookieLayout() const;
+    int Promote();
+    VersionedDirEntry* FindMutable(const NameKey& key);
+    const VersionedDirEntry* Find(const NameKey& key) const;
+    void IncrementSmallGeneration();
+
+    DirState     mState;
+    uint64_t     mGeneration;
+    int          mLargeThreshold;
+    int          mPromoteMaxWallMs;
+    size_t       mChildCount;
+    SmallEntries mSmall;
+    LargeEntries mLarge;
+};
+
+class DirTable // DirNode storage keyed by directory fid
+{
+public:
+    DirTable();
+    bool Insert(fid_t dirFid, const DirNode& dir); // checkpoint load treats false as invalid input
+    DirNode* Find(fid_t dirFid); // callers treat null as "no such directory"
+    const DirNode* Find(fid_t dirFid) const;
+    void CommitThrough(TxnId committedTxn);
+    size_t Size() const;
+    void GetCommittedEntries(TxnId committedTxn,
+        std::vector<CheckpointDirEntry>& entries) const; // source of checkpoint "dentry" lines
+    void GetDirGenerations(
+        std::vector<std::pair<fid_t, uint64_t> >& generations) const; // source of checkpoint "dirgen" lines
+
+private:
+    typedef std::unordered_map<fid_t, DirNode> Table;
+    typedef std::vector<Table> Tables;
+    Tables mTables; // several fid-keyed maps; presumably one per shard — confirm
+};
+
+struct LookupResult // attribute snapshot filled in by NamespaceStore::FillLookupResult
+{
+    fid_t     fid;
+    InodeType type;
+    uint64_t  parentGeneration;
+    kfsUid_t  user;
+    kfsGid_t  group;
+    kfsMode_t mode;
+    int16_t   numReplicas;
+    int64_t   mtime;
+    int64_t   ctime;
+    int64_t   atime;
+    int64_t   fileCount; // directories only: the DirNode child count (0 without a DirNode)
+    int64_t   dirCount;  // directories only: always written as 0
+
+    LookupResult();
+};
+
+struct CreateResult // optional output of Create / ApplyCreatePending
+{
+    fid_t    fid;              // presumably the fid given to the new inode — confirm
+    TxnId    txnId;            // presumably the transaction that created it — confirm
+    uint64_t parentGeneration;
+
+    CreateResult();
+};
+
+struct EditLogRecord // one namespace mutation as replayed by NamespaceStore::ApplyEditLog
+{
+    enum Type
+    {
+        kInvalid,
+        kCreate,
+        kRemove,
+        kRename
+    };
+
+    Type        type;
+    TxnId       txnId;         // must be above the committed transaction to be applied
+    fid_t       parentFid;     // directory containing name
+    std::string name;          // entry created, removed, or renamed from
+    fid_t       fid;           // create: fid for the new inode; rename: expected source fid
+    InodeType   inodeType;     // remove: directories are removed with the require-empty check
+    kfsUid_t    user;          // user..mtime are forwarded to ApplyCreate for kCreate
+    kfsGid_t    group;
+    kfsMode_t   mode;
+    int16_t     numReplicas;
+    int64_t     mtime;
+    std::string newPath;       // rename destination
+    bool        overwriteFlag; // rename only
+
+    EditLogRecord();
+};
+
+int WriteEditLog(std::ostream& os, const EditLogRecord& record); // presumably the inverse of ReadEditLog — confirm
+int ReadEditLog(const std::string& line, EditLogRecord& record); // parses one text line; non-zero means failure
+
+class NamespaceStore
+{
+public:
+    explicit NamespaceStore(
+        const Config& config = GetConfig(), fid_t rootFid = ROOTFID);
+
+    fid_t GetRootFid() const;
+    TxnId GetCommittedTxn() const;
+    TxnId GetLastTxn() const;
+    size_t GetInodeCount() const;
+    size_t GetDirCount() const;
+    void ReserveCreateIds(fid_t& childFid, TxnId& txnId);
+    void ReserveCreateIdsRange(
+        size_t count, fid_t& firstFid, TxnId& firstTxn);
+
+    int Create(fid_t parentFid, const std::string& name, InodeType type,
+        CreateResult* resultPtr = 0, kfsUid_t user = kKfsUserRoot,
+        kfsGid_t group = kKfsGroupRoot, kfsMode_t mode = 0,
+        int16_t numReplicas = 1, int64_t mtime = 0);
+    // Test/bench helper: ReserveCreateIds + ApplyCreate (pending until CommitThrough).
+    int ApplyCreatePending(fid_t parentFid, const std::string& name,
+        InodeType type, CreateResult* resultPtr = 0,
+        kfsUid_t user = kKfsUserRoot, kfsGid_t group = kKfsGroupRoot,
+        kfsMode_t mode = 0, int16_t numReplicas = 1, int64_t mtime = 0);
+    int Lookup(fid_t parentFid, const std::string& name,
+        LookupResult& result) const;
+    int LookupPath(fid_t rootFid, const std::string& path,
+        LookupResult& result) const;
+    int GetAttr(fid_t fid, LookupResult& result) const;
+    int Readdir(fid_t dirFid, const ReaddirCookie* cookiePtr,
+        size_t maxEntries, ReaddirResult& result) const;
+    int ReaddirFromName(fid_t dirFid, const std::string& name,
+        size_t maxEntries, ReaddirResult& result) const;
+    int Remove(fid_t parentFid, const std::string& name,
+        TxnId* txnIdPtr = 0);
+    int RemoveFile(fid_t parentFid, const std::string& name,
+        TxnId* txnIdPtr = 0);
+    int Rmdir(fid_t parentFid, const std::string& name,
+        TxnId* txnIdPtr = 0);
+    int Rename(fid_t parentFid, const std::string& oldName,
+        const std::string& newPath, bool overwriteFlag,
+        TxnId* txnIdPtr = 0, fid_t* srcFidPtr = 0);
+    int ApplyEditLog(const EditLogRecord& record, bool commitFlag = true);
+    int ApplyEditLog(std::istream& is);
+    int ApplyCreate(fid_t parentFid, const std::string& name, InodeType type,
+        fid_t childFid, TxnId txnId, kfsUid_t user = kKfsUserRoot,
+        kfsGid_t group = kKfsGroupRoot, kfsMode_t mode = 0,
+        int16_t numReplicas = 1, int64_t mtime = 0,
+        bool commitFlag = true, bool advanceSeedsFlag = false);
+    // Trusted apply fast path: assumes WAL / RPC start already validated
+    // parent/name/type semantics. Replay must not use this.
+    int ApplyCreateTrusted(fid_t parentFid, const std::string& name,
+        InodeType type, fid_t childFid, TxnId txnId,
+        kfsUid_t user = kKfsUserRoot, kfsGid_t group = kKfsGroupRoot,
+        kfsMode_t mode = 0, int16_t numReplicas = 1, int64_t mtime = 0,
+        bool commitFlag = true, bool advanceSeedsFlag = false);
+    int SaveCheckpoint(std::ostream& os) const;
+    int SaveCheckpointDiskEntry(std::ostream& os) const;
+    int LoadCheckpoint(std::istream& is);
+    void CommitThrough(TxnId committedTxn);
+    void CommitThroughRange(TxnId firstTxn, TxnId lastTxn);
+
+private:
+    TxnId GetCommittedTxnSnapshot() const;
+    TxnId GetLastTxnSnapshot() const;
+    void AllocateCreateIds(fid_t& childFid, TxnId& txnId);
+    TxnId AllocateTxnId();
+    void AdvanceSeeds(fid_t fid, TxnId txnId);
+    const InodeRecord* FindCommittedDir(
+        fid_t dirFid, TxnId committedTxn) const;
+    int ResolveCreateParentDir(
+        fid_t parentFid,
+        TxnId committedTxn,
+        DirNode*& dirPtr);
+    int CheckCreateParentName(
+        fid_t              parentFid,
+        const NameKey&     key,
+        TxnId              committedTxn);
+    int CreateSelf(fid_t parentFid, const NameKey& key, InodeType type,
+        fid_t childFid, TxnId txnId, CreateResult* resultPtr,
+        kfsUid_t user, kfsGid_t group, kfsMode_t mode,
+        int16_t numReplicas, int64_t mtime,
+        TxnId committedTxn);
+    int CreateSelfTrusted(fid_t parentFid, const NameKey& key, InodeType type,
+        fid_t childFid, TxnId txnId, CreateResult* resultPtr,
+        kfsUid_t user, kfsGid_t group, kfsMode_t mode,
+        int16_t numReplicas, int64_t mtime,
+        TxnId committedTxn);
+    int CreateSelf(fid_t parentFid, const std::string& name, InodeType type,
+        fid_t childFid, TxnId txnId, CreateResult* resultPtr,
+        kfsUid_t user, kfsGid_t group, kfsMode_t mode,
+        int16_t numReplicas, int64_t mtime,
+        TxnId committedTxn);
+    int ResolveRenameTarget(fid_t baseDirFid, const std::string& newPath,
+        fid_t& dstParentFid, std::string& dstName) const;
+    bool IsDescendant(fid_t ancestorFid, fid_t dirFid,
+        TxnId committedTxn) const;
+    int RemoveSelf(fid_t parentFid, const std::string& name,
+        InodeType type, bool requireEmptyFlag, TxnId txnId,
+        TxnId* txnIdPtr);
+    int RenameSelf(fid_t parentFid, const std::string& oldName,
+        const std::string& newPath, bool overwriteFlag, TxnId txnId,
+        TxnId* txnIdPtr, fid_t* srcFidPtr);
+    int FillLookupResult(const InodeRecord& inode,
+        uint64_t parentGeneration, LookupResult& result) const;
+
+    Config     mConfig;
+    fid_t      mRootFid;
+    fid_t      mNextFid;
+    TxnId      mNextTxn;
+    std::atomic<TxnId> mCommittedTxn;
+    std::set<TxnId> mPendingCommittedTxns;
+    InodeTable mInodes;
+    DirTable   mDirs;
+};
+
+NamespaceStore& GetStore();
+
+struct ResourceLockKey
+{
+    enum Class
+    {
+        kSnapshot = 1,
+        kDir      = 2,
+        kInode    = 3,
+        kBlockMap = 4,
+        kEditLog  = 5
+    };
+
+    Class    resourceClass;
+    uint64_t major;
+    uint64_t minor;
+
+    ResourceLockKey(Class resourceClass, uint64_t major, uint64_t minor = 0);
+
+    bool operator<(const ResourceLockKey& other) const;
+};
+
+} // namespace NamespaceV2
+} // namespace KFS
+
+#endif // META_NAMESPACE_V2_H
diff --git a/src/cc/meta/NetDispatch.cc b/src/cc/meta/NetDispatch.cc
index 9d1ffcf6b..14de05230 100644
--- a/src/cc/meta/NetDispatch.cc
+++ b/src/cc/meta/NetDispatch.cc
@@ -1177,6 +1177,50 @@ NetDispatch::Dispatch(MetaRequest *r)
     }
 }
 
+// Hands completed requests back to their clients. The batch path is taken
+// only when every entry has a client and all clients share one client thread:
+// EnqueueBatch reads each entry again, so nothing may be released before it.
+void
+NetDispatch::DispatchBatch(MetaRequest* const* reqs, size_t count)
+{
+    if (! reqs || count == 0) {
+        return;
+    }
+    ClientManager::ClientThread* thread    = 0;
+    bool                         batchFlag = true;
+    for (size_t i = 0; i < count && batchFlag; ++i) {
+        MetaRequest* const r = reqs[i];
+        if (! r) {
+            continue;
+        }
+        if (! r->clnt) {
+            batchFlag = false; // no client: leave it to Dispatch() below
+            continue;
+        }
+        ClientManager::ClientThread* const reqThread =
+            static_cast<ClientSM*>(r->clnt)->GetClientThread();
+        if (! thread) {
+            thread = reqThread;
+        }
+        batchFlag = thread == reqThread;
+    }
+    if (batchFlag && thread) {
+        for (size_t i = 0; i < count; ++i) {
+            if (reqs[i]) {
+                sReqStatsGatherer.OpDone(*reqs[i]);
+                reqs[i]->submitCount = 0;
+            }
+        }
+        mClientManager.EnqueueBatch(thread, reqs, count);
+        return;
+    }
+    for (size_t i = 0; i < count; ++i) {
+        if (reqs[i]) {
+            Dispatch(reqs[i]);
+        }
+    }
+}
+
 void NetDispatch::SetMaxClientSockets(int count)
 {
     mClientManager.SetMaxClientSockets(count);
@@ -1627,10 +1671,16 @@ class ClientManager::ClientThread :
         assert(mReqPendingQueue.IsEmpty());
         // Dispatch requests.
         MetaRequest* op;
+        bool         needLogFlushFlag = false;
         while ((op = reqPendingQueue.PopFront())) {
             submit_request(op);
+            if (op->commitPendingFlag) {
+                needLogFlushFlag = true;
+            }
+        }
+        if (needLogFlushFlag) {
+            MetaRequest::GetLogWriter().ScheduleFlush();
         }
-        MetaRequest::GetLogWriter().ScheduleFlush();
         gNetDispatch.ForkDone();
         mPrimaryFlag = gLayoutManager.IsPrimary() &&
             MetaRequest::GetLogWriter().IsPrimary(mNetManager.NowUsec());
@@ -1711,6 +1761,28 @@ class ClientManager::ClientThread :
             mNetManager.Wakeup();
         }
     }
+    // Appends every request that has a client to the request queue under a
+    // single lock, then wakes the net manager if the queue had been empty.
+    void EnqueueBatch(MetaRequest* const* reqs, size_t count)
+    {
+        if (count == 0 || ! reqs) {
+            return;
+        }
+        QCStMutexLocker locker(mMutex);
+        const bool wakeupFlag = mReqQueue.IsEmpty();
+        for (MetaRequest* const* it = reqs; it != reqs + count; ++it) {
+            MetaRequest* const req = *it;
+            if (req && req->clnt) {
+                req->next = 0;
+                mReqQueue.PushBack(*req);
+            }
+        }
+        locker.Unlock();
+        // The wakeup is issued after the lock has been dropped.
+        if (wakeupFlag) {
+            mNetManager.Wakeup();
+        }
+    }
     void Add(NetConnectionPtr& conn)
     {
         if (! conn || ! conn->IsGood() || ! mThread.IsStarted()) {
@@ -1966,6 +2038,37 @@ ClientManager::GetMutex()
     return mImpl.GetMutex();
 }
 
+// Routes completed requests: no thread -> NetDispatch::Dispatch() for each;
+// thread not started -> notify each client inline; otherwise queue the batch.
+void
+ClientManager::EnqueueBatch(
+    ClientManager::ClientThread* thread,
+    MetaRequest* const*          reqs,
+    size_t                       count)
+{
+    if (! reqs || count == 0) {
+        return; // same guard as ClientThread::EnqueueBatch
+    }
+    if (thread && thread->IsStarted()) {
+        thread->EnqueueBatch(reqs, count);
+        return;
+    }
+    for (size_t i = 0; i < count; ++i) {
+        MetaRequest* const op = reqs[i];
+        if (! op) {
+            continue;
+        }
+        if (! thread) {
+            gNetDispatch.Dispatch(op);
+        } else if (! op->clnt) {
+            MetaRequest::Release(op);
+        } else {
+            op->next = op; // next refers to the request itself
+            op->clnt->HandleEvent(EVENT_CMD_DONE, op);
+        }
+    }
+}
+
 /* static */ bool
 ClientManager::EnqueueSelf(ClientManager::ClientThread* thread, MetaRequest& op)
 {
diff --git a/src/cc/meta/NetDispatch.h b/src/cc/meta/NetDispatch.h
index 9fe324064..8f2f87955 100644
--- a/src/cc/meta/NetDispatch.h
+++ b/src/cc/meta/NetDispatch.h
@@ -63,6 +63,9 @@ class NetDispatch
     bool Start(MetaDataSync& metaDataSync);
     //!< Dispatch completed request.
     void Dispatch(MetaRequest* r);
+    //!< Dispatch multiple completed requests with one client-thread wakeup when
+    //!< they share the same ClientThread.
+    void DispatchBatch(MetaRequest* const* reqs, size_t count);
     int SetParameters(const Properties& props);
     void GetStatsCsv(ostream& os);
     void GetStatsCsv(IOBuffer& buf);
diff --git a/src/cc/meta/Replay.cc b/src/cc/meta/Replay.cc
index a22cd1977..2b2e8100a 100644
--- a/src/cc/meta/Replay.cc
+++ b/src/cc/meta/Replay.cc
@@ -34,6 +34,7 @@
 #include "MetaVrSM.h"
 #include "MetaVrOps.h"
 #include "MetaDataStore.h"
+#include "NamespaceV2.h"
 
 #include "common/MdStream.h"
 #include "common/MsgLogger.h"
@@ -44,6 +45,7 @@
 #include "common/StBuffer.h"
 
 #include "kfsio/checksum.h"
+#include "kfsio/Base64.h"
 
 #include "qcdio/QCUtils.h"
 
@@ -939,6 +941,191 @@ replay_mkdir(DETokenizer& c)
     return (ok && 0 == status);
 }
 
+// Converts one hexadecimal digit to its numeric value.
+// Both upper and lower case letters are accepted.
+// Returns 0..15, or -1 when ch is not a hexadecimal digit.
+static int
+HexNibble(
+    char ch)
+{
+    // Classify once, then pick the matching offset.
+    const bool digitFlag = ch >= '0' && ch <= '9';
+    const bool lowerFlag = ch >= 'a' && ch <= 'f';
+    const bool upperFlag = ch >= 'A' && ch <= 'F';
+    return digitFlag ? ch - '0' :
+        lowerFlag ? ch - 'a' + 10 :
+        upperFlag ? ch - 'A' + 10 : -1;
+}
+
+// Decodes pairs of hexadecimal digits from hex into bytes stored in out.
+// Returns false if the length is odd or any character is not a hex digit.
+static bool
+DecodeHexBytes(const string& hex, string& out)
+{
+    out.clear();
+    if (hex.size() % 2 != 0) {
+        return false;
+    }
+    out.reserve(hex.size() / 2);
+    for (size_t pos = 0; pos < hex.size(); pos += 2) {
+        const int high = HexNibble(hex[pos]);
+        const int low  = HexNibble(hex[pos + 1]);
+        if (high < 0 || low < 0) {
+            return false;
+        }
+        out += char((high << 4) | low);
+    }
+    return true;
+}
+
+// Base64 decodes b64 into out. A decoded length of zero or less is treated
+// as failure: out is cleared and false is returned.
+static bool
+DecodeBase64Bytes(const string& b64, string& out)
+{
+    StBufferT<char, 64> buf;
+    const int   srcLen = (int)b64.size();
+    char* const dst    = buf.Resize(Base64::GetMaxDecodedLength(srcLen));
+    const int   dstLen = Base64::Decode(b64.data(), srcLen, dst, true);
+    out.clear();
+    if (dstLen > 0) {
+        out.assign(dst, dstLen);
+    }
+    return dstLen > 0;
+}
+
+// Sequential reader over a byte string holding little endian integers and
+// raw byte runs. Only a reference to buf is kept: buf must outlive the reader.
+class NamespaceV2WalBatchReader
+{
+public:
+    explicit NamespaceV2WalBatchReader(const string& buf)
+        : mBuf(buf), mPos(0)
+        {}
+    // Reads sizeof(T) bytes, least significant first; false if too few remain.
+    template<typename T>
+    bool ReadLe(T& out)
+    {
+        if (mBuf.size() - mPos < sizeof(T)) {
+            return false;
+        }
+        uint64_t v = 0;
+        for (size_t i = 0; i < sizeof(T); i++) {
+            v |= (uint64_t)(unsigned char)mBuf[mPos++] << (i * 8);
+        }
+        out = (T)v;
+        return true;
+    }
+
+    // Copies the next len bytes into out; false, consuming nothing, if fewer
+    // remain. Comparing with the bytes left cannot wrap, unlike mPos + len.
+    bool ReadBytes(string& out, size_t len)
+    {
+        if (mBuf.size() - mPos < len) {
+            return false;
+        }
+        out.assign(mBuf.data() + mPos, len);
+        mPos += len;
+        return true;
+    }
+    bool Done() const { return mPos == mBuf.size(); } // all bytes consumed
+private:
+    const string& mBuf;
+    size_t        mPos; // never exceeds mBuf.size()
+};
+
+// Replays one "nv2batch" record: a record count followed by a base64 ("b")
+// or hex ("h") encoded blob of packed create entries. Every entry is applied
+// with commitFlag false, then the first..last transaction range is committed.
+// Returns false on any parse, decode, or apply failure, or on trailing bytes.
+static bool
+replay_nv2batch(DETokenizer& c)
+{
+    c.pop_front(); // record type
+    int64_t count      = 0;
+    string  encoded;
+    bool    base64Flag = false;
+    bool    ok         = pop_num(count, "c", c, true);
+    if (ok) {
+        // Try the new format /b/<base64> first, then legacy /h/<hex>.
+        base64Flag = pop_name(encoded, "b", c, ok);
+        if (! base64Flag) {
+            ok = pop_name(encoded, "h", c, ok);
+        }
+    }
+    if (count <= 0 || ! ok) {
+        return false;
+    }
+    string payload;
+    const bool decodedFlag = base64Flag ?
+        DecodeBase64Bytes(encoded, payload) : DecodeHexBytes(encoded, payload);
+    if (! decodedFlag) {
+        return false;
+    }
+    NamespaceV2WalBatchReader    reader(payload);
+    NamespaceV2::NamespaceStore& store    = NamespaceV2::GetStore();
+    NamespaceV2::TxnId           firstTxn = 0;
+    NamespaceV2::TxnId           lastTxn  = 0;
+    for (int64_t i = 0; i < count; i++) {
+        uint8_t  opType      = 0;
+        int64_t  parentFid   = -1;
+        int64_t  fid         = -1;
+        uint64_t txnId       = 0;
+        uint32_t user        = 0;
+        uint32_t group       = 0;
+        uint16_t mode        = 0;
+        int16_t  numReplicas = 0;
+        int64_t  mtime       = 0;
+        uint16_t nameLen     = 0;
+        string   name;
+        const bool readFlag =
+            reader.ReadLe(opType) &&
+            reader.ReadLe(parentFid) &&
+            reader.ReadLe(fid) &&
+            reader.ReadLe(txnId) &&
+            reader.ReadLe(user) &&
+            reader.ReadLe(group) &&
+            reader.ReadLe(mode) &&
+            reader.ReadLe(numReplicas) &&
+            reader.ReadLe(mtime) &&
+            reader.ReadLe(nameLen) &&
+            reader.ReadBytes(name, nameLen);
+        if (! readFlag) {
+            return false;
+        }
+        // Op type 2 selects a directory, every other value a file.
+        const NamespaceV2::InodeType type = opType == 2 ?
+            NamespaceV2::kInodeTypeDir : NamespaceV2::kInodeTypeFile;
+        const NamespaceV2::TxnId txn = (NamespaceV2::TxnId)txnId;
+        const int status = store.ApplyCreate(
+            (fid_t)parentFid, name, type, (fid_t)fid, txn,
+            (kfsUid_t)user, (kfsGid_t)group, (kfsMode_t)mode,
+            numReplicas, mtime,
+            false, // commitFlag
+            true   // advanceSeedsFlag
+        );
+        if (status != 0) {
+            return false;
+        }
+        if (firstTxn == 0) {
+            firstTxn = txn;
+        }
+        lastTxn = txn;
+    }
+    if (! reader.Done()) {
+        return false;
+    }
+    store.CommitThroughRange(firstTxn, lastTxn);
+    return true;
+}
+
+static bool
+replay_nv2batchc(DETokenizer& c)
+{
+    c.pop_front(); // record type; nothing else in the record is examined
+    return true;   // always reports success
+}
+
 /*!
  * \brief replay remove
  * format: remove/dir/<parentID>/name/<name>
@@ -2158,6 +2345,8 @@ get_entry_map()
     e.add_parser("version",                 &replay_version);
     e.add_parser("create",                  &replay_create);
     e.add_parser("mkdir",                   &replay_mkdir);
+    e.add_parser("nv2batch",                &replay_nv2batch);
+    e.add_parser("nv2batchc",               &replay_nv2batchc);
     e.add_parser("remove",                  &replay_remove);
     e.add_parser("rmdir",                   &replay_rmdir);
     e.add_parser("rename",                  &replay_rename);
diff --git a/src/cc/meta/Replay.h b/src/cc/meta/Replay.h
index de23d1ff1..5eec6ca17 100644
--- a/src/cc/meta/Replay.h
+++ b/src/cc/meta/Replay.h
@@ -127,6 +127,7 @@ class Replay
     bool commitAll();
     bool submit(MetaRequest& req)
         { return (enqueueFlag && enqueue(req)); }
+    bool isSubmitQueueEnabled() const { return enqueueFlag; }
     vrNodeId_t getPrimaryNodeId() const
         { return primaryNodeId; }
     void handle(
diff --git a/src/cc/meta/Restorer.cc b/src/cc/meta/Restorer.cc
index 7569363ee..1277c55f5 100644
--- a/src/cc/meta/Restorer.cc
+++ b/src/cc/meta/Restorer.cc
@@ -36,6 +36,7 @@
 #include "NetDispatch.h"
 #include "LogWriter.h"
 #include "MetaVrSM.h"
+#include "NamespaceV2.h"
 
 #include "common/MdStream.h"
 #include "common/MsgLogger.h"
@@ -47,16 +48,167 @@
 #include <cerrno>
 #include <cstring>
 #include <fstream>
+#include <sstream>
 
 namespace KFS
 {
 using std::cerr;
 using std::string;
 using std::ifstream;
+using std::stringstream;
 
 static int16_t sMinReplicasPerFile     = 0;
 static bool    sHasVrSequenceFlag      = false;
 static bool    sVrSequenceRequiredFlag = false;
+static stringstream sNamespaceV2CheckpointImage;
+static bool         sNamespaceV2CheckpointStartedFlag = false;
+
+static void
+reset_namespace_v2_checkpoint_image()
+{
+    sNamespaceV2CheckpointImage.str(string()); // drop the buffered text
+    sNamespaceV2CheckpointImage.clear();       // reset the stream state flags
+    sNamespaceV2CheckpointStartedFlag = false; // no "state" line seen yet
+}
+
+// Converts the front token to a number, stores it in value, and pops it.
+// Returns false, leaving the token in place, if c is empty or isLastOk()
+// reports a failed conversion; value is overwritten in the latter case.
+static bool
+pop_namespace_v2_number(DETokenizer& c, int64_t& value)
+{
+    if (! c.empty()) {
+        value = c.toNumber();
+        if (c.isLastOk()) {
+            c.pop_front();
+            return true;
+        }
+    }
+    return false;
+}
+
+// Copies the front token into value and pops it.
+// Returns false, changing nothing, if there is no token or it is empty.
+static bool
+pop_namespace_v2_token(DETokenizer& c, string& value)
+{
+    const bool okFlag = ! c.empty() && ! c.front().empty();
+    if (okFlag) {
+        value = c.front();
+        c.pop_front();
+    }
+    return okFlag;
+}
+
+// Handles one "nv2" checkpoint line: buffers it, or on "end" loads the image.
+static bool
+restore_namespace_v2(DETokenizer& c)
+{
+    c.pop_front(); // "nv2" entry keyword
+    if (c.empty()) {
+        return false;
+    }
+    const DETokenizer::Token tag = c.front();
+    c.pop_front();
+    if (tag == "state") {
+        int64_t rootFid = -1;
+        int64_t nextFid = -1;
+        int64_t nextTxn = -1;
+        int64_t committedTxn = -1;
+        int64_t largeThreshold = -1;
+        if (! pop_namespace_v2_number(c, rootFid) ||
+                ! pop_namespace_v2_number(c, nextFid) ||
+                ! pop_namespace_v2_number(c, nextTxn) ||
+                ! pop_namespace_v2_number(c, committedTxn) ||
+                ! pop_namespace_v2_number(c, largeThreshold) ||
+                ! c.empty()) {
+            return false;
+        }
+        reset_namespace_v2_checkpoint_image(); // "state" starts a new image
+        sNamespaceV2CheckpointStartedFlag = true;
+        sNamespaceV2CheckpointImage << "namespacev2_checkpoint 1\n" <<
+            "state " << rootFid << " " << nextFid << " " <<
+            nextTxn << " " << committedTxn << " " <<
+            largeThreshold << "\n";
+        return true;
+    }
+    if (! sNamespaceV2CheckpointStartedFlag) { // other tags need "state" first
+        return false;
+    }
+    if (tag == "inode") {
+        int64_t fid = -1;
+        int64_t parentFid = -1;
+        int64_t type = -1;
+        int64_t generation = -1;
+        int64_t user = -1;
+        int64_t group = -1;
+        int64_t mode = -1;
+        int64_t numReplicas = -1;
+        int64_t mtime = -1;
+        int64_t ctime = -1;
+        int64_t atime = -1;
+        if (! pop_namespace_v2_number(c, fid) ||
+                ! pop_namespace_v2_number(c, parentFid) ||
+                ! pop_namespace_v2_number(c, type) ||
+                ! pop_namespace_v2_number(c, generation) ||
+                ! pop_namespace_v2_number(c, user) ||
+                ! pop_namespace_v2_number(c, group) ||
+                ! pop_namespace_v2_number(c, mode) ||
+                ! pop_namespace_v2_number(c, numReplicas) ||
+                ! pop_namespace_v2_number(c, mtime) ||
+                ! pop_namespace_v2_number(c, ctime) ||
+                ! pop_namespace_v2_number(c, atime) ||
+                ! c.empty()) {
+            return false;
+        }
+        sNamespaceV2CheckpointImage << "inode " << fid << " " <<
+            parentFid << " " << type << " " << generation << " " <<
+            user << " " << group << " " << mode << " " <<
+            numReplicas << " " << mtime << " " << ctime << " " <<
+            atime << "\n";
+        return true;
+    }
+    if (tag == "dirgen") {
+        int64_t dirFid = -1;
+        int64_t generation = -1;
+        if (! pop_namespace_v2_number(c, dirFid) ||
+                ! pop_namespace_v2_number(c, generation) || ! c.empty()) {
+            return false;
+        }
+        sNamespaceV2CheckpointImage << "dirgen " << dirFid << " " <<
+            generation << "\n";
+        return true;
+    }
+    if (tag == "dentry") {
+        int64_t parentFid = -1;
+        int64_t childFid = -1;
+        string encodedName;
+        if (! pop_namespace_v2_number(c, parentFid) ||
+                ! pop_namespace_v2_number(c, childFid) ||
+                ! pop_namespace_v2_token(c, encodedName) || ! c.empty()) {
+            return false;
+        }
+        sNamespaceV2CheckpointImage << "dentry " << parentFid << " " <<
+            childFid << " " << encodedName << "\n";
+        return true;
+    }
+    if (tag != "end" || ! c.empty()) { // only a bare "end" is left to accept
+        return false;
+    }
+    sNamespaceV2CheckpointImage << "end\n";
+    sNamespaceV2CheckpointImage.clear();
+    sNamespaceV2CheckpointImage.seekg(0); // read the image from its beginning
+    const int status = NamespaceV2::GetStore().LoadCheckpoint(
+        sNamespaceV2CheckpointImage);
+    if (status != 0) {
+        KFS_LOG_STREAM_ERROR <<
+            "namespace v2 checkpoint restore failure: " << status <<
+        KFS_LOG_EOM;
+        return false;
+    }
+    reset_namespace_v2_checkpoint_image();
+    return true;
+}
 
 static bool
 checkpoint_seq(DETokenizer& c)
@@ -933,6 +1085,7 @@ get_entry_map()
     e.add_parser("worm",                    &restore_worm_mode);
     e.add_parser("ckey",                    &restore_crypto_key);
     e.add_parser("shortnames",              &restore_short_names);
+    e.add_parser("nv2",                     &restore_namespace_v2);
     Replay::AddRestotreEntries(e);
     initied = true;
     return e;
@@ -970,6 +1123,7 @@ Restorer::rebuild(const string& cpname, int16_t minReplicas)
     }
     sMinReplicasPerFile     = minReplicas;
     sVrSequenceRequiredFlag = mVrSequenceRequiredFlag;
+    reset_namespace_v2_checkpoint_image();
     ifstream file;
     file.open(cpname.c_str(), ifstream::binary | ifstream::in);
     if (file.fail()) {
@@ -1015,6 +1169,13 @@ Restorer::rebuild(const string& cpname, int16_t minReplicas)
         is_ok = false;
     }
     file.close();
+    if (is_ok && sNamespaceV2CheckpointStartedFlag) {
+        KFS_LOG_STREAM_FATAL <<
+            cpname << ": incomplete namespace v2 checkpoint image" <<
+        KFS_LOG_EOM;
+        is_ok = false;
+    }
+    reset_namespace_v2_checkpoint_image();
     if (is_ok && lastLineChecksumFlag) {
         const string md = mds.GetMd();
         if (restoreChecksum != md) {
diff --git a/src/cc/meta/namespacev2bench_main.cc b/src/cc/meta/namespacev2bench_main.cc
new file mode 100644
index 000000000..4570cb5d8
--- /dev/null
+++ b/src/cc/meta/namespacev2bench_main.cc
@@ -0,0 +1,367 @@
+//---------------------------------------------------------- -*- Mode: C++ -*-
+// $Id$
+//
+// Micro benchmark for RFC-0001 NamespaceV2 scaffolding.
+//
+// Copyright 2026 Quantcast Corporation. All rights reserved.
+//
+// This file is part of Kosmos File System (KFS).
+//
+// Licensed under the Apache License, Version 2.0.
+//
+//----------------------------------------------------------------------------
+
+#include "NamespaceV2.h"
+
+#include <stdint.h>
+#include <sys/time.h>
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+using namespace KFS;
+using namespace KFS::NamespaceV2;
+using std::cout;
+using std::string;
+using std::vector;
+
+namespace
+{
+
+struct Options
+{
+    uint64_t entries;         // --entries
+    uint64_t dirs;            // --dirs
+    uint64_t lookupSamples;   // --lookup-samples
+    size_t   readdirPageSize; // --readdir-page
+    int      largeThreshold;  // --threshold
+    // Defaults; the threshold is taken from a default constructed Config.
+    Options()
+        : entries(1000000),
+          dirs(1),
+          lookupSamples(100000),
+          readdirPageSize(1000),
+          largeThreshold(Config().dirLargeThreshold)
+        {}
+};
+
+    uint64_t
+NowUsec()
+{
+    timeval tv;
+    gettimeofday(&tv, 0); // the return value is not checked
+    return uint64_t(tv.tv_sec) * 1000000 + tv.tv_usec; // seconds -> usec
+}
+
+// Rate per second for count events in usec microseconds; 0 when usec is 0.
+    double
+OpsPerSec(uint64_t count, uint64_t usec)
+{
+    const double scaled = double(count) * 1000000.0;
+    return usec != 0 ? scaled / double(usec) : 0;
+}
+
+// Parses an unsigned decimal number; prints a diagnostic and returns false on
+// "" or "-N" (both silently accepted by strtoull), trailing text, or overflow.
+    bool
+ParseUInt64(const char* namePtr, const char* valuePtr, uint64_t& value)
+{
+    char* endPtr = 0;
+    errno = 0;
+    const unsigned long long parsed = strtoull(valuePtr, &endPtr, 10);
+    if (errno != 0 || endPtr == valuePtr || *endPtr != 0 ||
+            strchr(valuePtr, '-')) {
+        cout << "invalid " << namePtr << ": " << valuePtr << "\n";
+        return false;
+    }
+    value = uint64_t(parsed);
+    return true;
+}
+
+// Parses valuePtr with ParseUInt64() and converts the result to size_t.
+// Returns what ParseUInt64() returned; value is assigned only on success.
+// namePtr is passed through for the diagnostic.
+    bool
+ParseSize(const char* namePtr, const char* valuePtr, size_t& value)
+{
+    uint64_t   wide   = 0;
+    const bool okFlag = ParseUInt64(namePtr, valuePtr, wide);
+    if (okFlag) {
+        value = size_t(wide);
+    }
+    return okFlag;
+}
+
+// Parses a strictly positive decimal int; prints a diagnostic naming namePtr
+// and returns false on anything else, including values that do not fit in int.
+    bool
+ParseInt(const char* namePtr, const char* valuePtr, int& value)
+{
+    char* endPtr = 0;
+    errno = 0;
+    const long parsed = strtol(valuePtr, &endPtr, 10);
+    if (errno != 0 || ! endPtr || *endPtr != 0 || parsed <= 0 ||
+            long(int(parsed)) != parsed) {
+        cout << "invalid " << namePtr << ": " << valuePtr << "\n";
+        return false;
+    }
+    value = int(parsed);
+    return true;
+}
+
+// Prints the command line summary to standard output, with progPtr shown as
+// the program name.
+    void
+Usage(const char* progPtr)
+{
+    cout <<
+        "Usage: " << progPtr << " [options]\n"
+        "  --entries N          number of file creates, default 1000000\n"
+        "  --dirs N             parent directories to spread creates, default 1\n"
+        "  --lookup-samples N   committed lookups to sample, default 100000\n"
+        "  --readdir-page N     committed readdir page size, default 1000\n"
+        "  --threshold N        Small to Large promotion threshold, default 4096\n";
+}
+
+// Parses command line options into options and clamps them to usable values.
+// Returns false, after printing usage where appropriate, when help was
+// requested, an option is unknown or lacks a value, or a value is invalid.
+    bool
+ParseOptions(
+    int      argc,
+    char**   argv,
+    Options& options)
+{
+    for (int i = 1; i < argc; i += 2) {
+        const char* const argPtr = argv[i];
+        if (strcmp(argPtr, "--help") == 0 || strcmp(argPtr, "-h") == 0) {
+            Usage(argv[0]);
+            return false;
+        }
+        // A value must follow; checked before the option name is matched.
+        if (argc <= i + 1) {
+            cout << "missing value for " << argPtr << "\n";
+            Usage(argv[0]);
+            return false;
+        }
+        const char* const valuePtr = argv[i + 1];
+        bool parsedFlag = false;
+        if (strcmp(argPtr, "--entries") == 0) {
+            parsedFlag = ParseUInt64(argPtr, valuePtr, options.entries);
+        } else if (strcmp(argPtr, "--dirs") == 0) {
+            parsedFlag = ParseUInt64(argPtr, valuePtr, options.dirs);
+        } else if (strcmp(argPtr, "--lookup-samples") == 0) {
+            parsedFlag = ParseUInt64(argPtr, valuePtr, options.lookupSamples);
+        } else if (strcmp(argPtr, "--readdir-page") == 0) {
+            parsedFlag = ParseSize(argPtr, valuePtr, options.readdirPageSize);
+        } else if (strcmp(argPtr, "--threshold") == 0) {
+            parsedFlag = ParseInt(argPtr, valuePtr, options.largeThreshold);
+        } else {
+            cout << "unknown option: " << argPtr << "\n";
+            Usage(argv[0]);
+            return false;
+        }
+        if (! parsedFlag) {
+            return false;
+        }
+    }
+    // Clamp: at least one entry, directory, and readdir slot; never sample
+    // more lookups than there are entries.
+    options.entries       = std::max<uint64_t>(1, options.entries);
+    options.dirs          = std::max<uint64_t>(1, options.dirs);
+    options.lookupSamples = std::min(options.entries,
+        std::max<uint64_t>(1, options.lookupSamples));
+    options.readdirPageSize = std::max<size_t>(1, options.readdirPageSize);
+    return true;
+}
+
+// Name made of prefixPtr followed by index in decimal. The text is built in a
+// 64 byte buffer, so anything past 63 characters is cut off.
+    string
+MakeName(const char* prefixPtr, uint64_t index)
+{
+    char text[64];
+    const unsigned long long number = index;
+    snprintf(text, sizeof(text), "%s%llu", prefixPtr, number);
+    return text;
+}
+
+// Fills parentFids: the root alone when options.dirs is 1, otherwise new
+// committed directories "d_<n>". Returns 0 or the first failing create status.
+    int
+CreateParentDirs(NamespaceStore& store, const Options& options,
+    vector<fid_t>& parentFids)
+{
+    parentFids.clear();
+    if (options.dirs == 1) {
+        parentFids.push_back(store.GetRootFid());
+        return 0;
+    }
+    parentFids.reserve(size_t(options.dirs));
+    for (uint64_t n = 0; n != options.dirs; ++n) {
+        CreateResult created;
+        const int ret = store.Create(
+            store.GetRootFid(), MakeName("d_", n), kInodeTypeDir, &created);
+        if (ret != 0) {
+            cout << "mkdir failed index=" << n << " status=" << ret << "\n";
+            return ret;
+        }
+        parentFids.push_back(created.fid);
+    }
+    store.CommitThrough(store.GetLastTxn());
+    return 0;
+}
+
+// Times options.entries pending creates of files "f_<n>" spread over
+// parentFids, then one CommitThrough(). Returns 0 or the first failing status.
+    int
+BenchmarkCreate(
+    NamespaceStore&      store,
+    const Options&       options,
+    const vector<fid_t>& parentFids)
+{
+    const uint64_t createStart = NowUsec();
+    for (uint64_t n = 0; n < options.entries; ++n) {
+        const int ret = store.ApplyCreatePending(
+            parentFids[size_t(n % parentFids.size())], MakeName("f_", n),
+            kInodeTypeFile);
+        if (ret != 0) {
+            cout << "create failed index=" << n << " status=" << ret << "\n";
+            return ret;
+        }
+    }
+    const uint64_t createUsec = NowUsec() - createStart;
+    cout << "create count=" << options.entries <<
+        " usec=" << createUsec <<
+        " ops_per_sec=" << OpsPerSec(options.entries, createUsec) << "\n";
+    const uint64_t commitStart = NowUsec();
+    store.CommitThrough(store.GetLastTxn());
+    const uint64_t commitUsec = NowUsec() - commitStart;
+    cout << "commit txn=" << store.GetCommittedTxn() <<
+        " usec=" << commitUsec << "\n";
+    return 0;
+}
+
+// Times options.lookupSamples lookups of files "f_<index>", advancing index by
+// entries / lookupSamples (at least 1) modulo entries. Returns 0 or first error.
+    int
+BenchmarkLookup(const NamespaceStore& store, const Options& options,
+    const vector<fid_t>& parentFids)
+{
+    const uint64_t step =
+        std::max<uint64_t>(1, options.entries / options.lookupSamples);
+    uint64_t       hits  = 0;
+    uint64_t       index = 0;
+    const uint64_t begin = NowUsec();
+    for (uint64_t done = 0; done < options.lookupSamples; ++done) {
+        LookupResult info;
+        const int ret = store.Lookup(
+            parentFids[size_t(index % parentFids.size())],
+            MakeName("f_", index), info);
+        if (ret != 0) {
+            cout << "lookup failed index=" << index << " status=" << ret <<
+                "\n";
+            return ret;
+        }
+        ++hits;
+        index = (index + step) % options.entries;
+    }
+    const uint64_t lookupUsec = NowUsec() - begin;
+    cout << "lookup count=" << options.lookupSamples <<
+        " found=" << hits << " usec=" << lookupUsec <<
+        " ops_per_sec=" << OpsPerSec(options.lookupSamples, lookupUsec) <<
+        "\n";
+    return 0;
+}
+
+// Times paging through each directory in parentFids. Returns the first failing
+// readdir status, 0 if the entries seen equal options.entries, else -EIO.
+    int
+BenchmarkReaddir(
+    const NamespaceStore& store,
+    const Options&        options,
+    const vector<fid_t>&  parentFids)
+{
+    uint64_t       entriesSeen = 0;
+    uint64_t       pagesRead   = 0;
+    const uint64_t begin       = NowUsec();
+    for (size_t dirIdx = 0; dirIdx < parentFids.size(); ++dirIdx) {
+        ReaddirCookie        resume;
+        const ReaddirCookie* resumePtr = 0; // the first page has no cookie
+        for (bool moreFlag = true; moreFlag; ) {
+            ReaddirResult page;
+            const int ret = store.Readdir(parentFids[dirIdx], resumePtr,
+                options.readdirPageSize, page);
+            if (ret != 0) {
+                cout << "readdir failed parent_index=" << dirIdx <<
+                    " status=" << ret << "\n";
+                return ret;
+            }
+            entriesSeen += page.entries.size();
+            ++pagesRead;
+            resume    = page.nextCookie;
+            resumePtr = &resume;
+            moreFlag  = page.moreEntriesFlag;
+        }
+    }
+    const uint64_t readdirUsec = NowUsec() - begin;
+    cout << "readdir entries=" << entriesSeen <<
+        " pages=" << pagesRead <<
+        " usec=" << readdirUsec <<
+        " entries_per_sec=" << OpsPerSec(entriesSeen, readdirUsec) << "\n";
+    return entriesSeen == options.entries ? 0 : -EIO;
+}
+
+} // namespace
+
+// Benchmark driver: parses the options, builds a store with namespace v2
+// enabled, then runs the setup, create, lookup, and readdir phases in order.
+// The exit status is 1 if option parsing or any phase fails, and 0
+// otherwise.
+    int
+main(
+    int    argc,
+    char** argv)
+{
+    Options options;
+    if (! ParseOptions(argc, argv, options)) {
+        return 1;
+    }
+
+    // Store under test: enabled, with the threshold taken from the options.
+    Config cfg;
+    cfg.enabledFlag       = true;
+    cfg.dirLargeThreshold = options.largeThreshold;
+    NamespaceStore store(cfg);
+    vector<fid_t>  parentFids;
+
+    // Echo the options as they are after ParseOptions().
+    cout << "namespacev2bench entries=" << options.entries <<
+        " dirs=" << options.dirs <<
+        " threshold=" << options.largeThreshold <<
+        " lookup_samples=" << options.lookupSamples <<
+        " readdir_page=" << options.readdirPageSize << "\n";
+
+    // Setup: choose or create the parent directories.
+    const uint64_t setupStart = NowUsec();
+    const int      setupRet   = CreateParentDirs(store, options, parentFids);
+    const uint64_t setupUsec  = NowUsec() - setupStart;
+    if (setupRet != 0) {
+        return 1;
+    }
+    cout << "setup dirs=" << parentFids.size() <<
+        " usec=" << setupUsec << "\n";
+
+    // The remaining phases stop at the first failure.
+    const bool okFlag =
+        BenchmarkCreate(store, options, parentFids) == 0 &&
+        BenchmarkLookup(store, options, parentFids) == 0 &&
+        BenchmarkReaddir(store, options, parentFids) == 0;
+    return okFlag ? 0 : 1;
+}
diff --git a/src/cc/meta/namespacev2test_main.cc b/src/cc/meta/namespacev2test_main.cc
new file mode 100644
index 000000000..071da6aeb
--- /dev/null
+++ b/src/cc/meta/namespacev2test_main.cc
@@ -0,0 +1,886 @@
+//---------------------------------------------------------- -*- Mode: C++ -*-
+// $Id$
+//
+// Unit tests for RFC-0001 NamespaceV2 scaffolding.
+//
+// Copyright 2026 Quantcast Corporation. All rights reserved.
+//
+// This file is part of Kosmos File System (KFS).
+//
+// Licensed under the Apache License, Version 2.0.
+//
+//----------------------------------------------------------------------------
+
+#include "NamespaceV2.h"
+
+#include "common/Properties.h"
+
+#include <algorithm>
+#include <atomic>
+#include <errno.h>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <vector>
+
+using namespace KFS;
+using namespace KFS::NamespaceV2;
+using std::cout;
+using std::string;
+using std::vector;
+
+namespace
+{
+
+int gErrorCount = 0;
+
+    void
+Check(
+    bool        okFlag,
+    const char* msgPtr)
+{
+    // A passing check is silent; a failing one is reported and counted.
+    if (okFlag) { return; }
+    cout << "FAILED: " << msgPtr << "\n";
+    gErrorCount++;
+}
+
+    string
+MakeName(
+    const char* prefixPtr,
+    int         first,
+    int         second = -1)
+{
+    // Yields "<prefix><first>", followed by "_<second>" unless second is
+    // negative (the default), in which case the suffix is left out.
+    std::ostringstream text;
+    text << prefixPtr << first;
+    if (0 <= second) { text << "_" << second; }
+    return text.str();
+}
+
+    void
+TestConfig()
+{ // Expects values passed to SetParameters() to be readable via GetConfig().
+    Properties props;
+    props.setValue("metaServer.namespaceV2.enabled", "1");
+    props.setValue("metaServer.namespaceV2.rpcEnabled", "1");
+    props.setValue("metaServer.dir.largeThreshold", "8");
+    props.setValue("metaServer.dir.promoteMaxWallMs", "77");
+    props.setValue("metaServer.namespaceV2.dirShardCount", "16");
+    SetParameters(props);
+    const Config& cfg = GetConfig();
+    Check(cfg.enabledFlag, "namespace v2 enabled flag");
+    Check(cfg.rpcEnabledFlag, "namespace v2 rpc enabled flag");
+    Check(cfg.dirLargeThreshold == 8, "large threshold");
+    Check(cfg.dirPromoteMaxWallMs == 77, "promotion wall limit");
+    Check(cfg.dirShardCount == 16, "dir shard count");
+}
+
+    void
+TestNameKey()
+{
+    // Two keys with different names; the ordering check accepts either order.
+    const NameKey keyA("a"), keyB("b");
+    Check(keyA == NameKey(keyA.hash, "a"), "name key equality");
+    Check((keyA < keyB) || (keyB < keyA), "name key strict ordering");
+}
+
+    void
+TestPendingCommittedVisibility()
+{ // Expects creates to stay hidden, and deletes to stay visible, until commit.
+    DirNode dir(4);
+    Check(dir.InsertPending("f", 100, 10) == 0, "insert pending");
+    Check(dir.LookupCommitted("f", 9) == 0, "pending create invisible");
+    dir.CommitThrough(10);
+    const VersionedDirEntry* entry = dir.LookupCommitted("f", 10);
+    Check(entry && entry->childFid == 100, "committed create visible");
+    fid_t deletedFid = -1; // sentinel, must be overwritten by DeletePending()
+    Check(dir.DeletePending("f", 20, &deletedFid) == 0, "delete pending");
+    Check(deletedFid == 100, "delete returns child fid");
+    Check(dir.LookupCommitted("f", 19) != 0, "pending delete still visible");
+    Check(dir.InsertPending("f", 101, 21) == -EEXIST,
+        "recreate rejected while delete pending");
+    dir.CommitThrough(20);
+    Check(dir.LookupCommitted("f", 20) == 0, "committed delete invisible");
+    Check(dir.InsertPending("f", 101, 30) == 0,
+        "recreate after delete commit");
+    Check(dir.LookupCommitted("f", 29) == 0, "recreate pending invisible");
+    dir.CommitThrough(30);
+    entry = dir.LookupCommitted("f", 30);
+    Check(entry && entry->childFid == 101, "recreate committed visible");
+}
+
+    void
+TestSmallCookieInvalidation()
+{ // Expects a small directory cookie to be rejected after a later insert.
+    DirNode dir(10);
+    Check(dir.InsertPending("b", 2, 1) == 0, "insert b");
+    Check(dir.InsertPending("a", 1, 2) == 0, "insert a");
+    dir.CommitThrough(2);
+    ReaddirResult res;
+    Check(dir.ReaddirCommitted(2, 0, 1, res) == 0, "small readdir first page");
+    Check(res.entries.size() == 1, "small readdir page size");
+    const ReaddirCookie oldCookie = res.nextCookie; // copy taken before insert
+    Check(dir.InsertPending("c", 3, 3) == 0, "insert c invalidates small cookie");
+    dir.CommitThrough(3);
+    Check(dir.ReaddirCommitted(3, &oldCookie, 1, res) == -EINVAL,
+        "old small cookie rejected");
+}
+
+    void
+TestPromotion()
+{ // Expects the third insert into DirNode(2) to promote the directory to large.
+    DirNode dir(2);
+    Check(dir.InsertPending("a", 1, 1) == 0, "promotion insert a");
+    Check(dir.InsertPending("b", 2, 2) == 0, "promotion insert b");
+    Check(! dir.IsLarge(), "at threshold still small");
+    const uint64_t genBefore = dir.GetGeneration(); // baseline for bump check
+    Check(dir.InsertPending("c", 3, 3) == 0, "promotion insert c");
+    Check(dir.IsLarge(), "promoted to large");
+    Check(dir.GetGeneration() > genBefore, "promotion increments generation");
+    dir.CommitThrough(3);
+    Check(dir.LookupCommitted("a", 3) != 0, "large lookup a");
+    Check(dir.LookupCommitted("b", 3) != 0, "large lookup b");
+    Check(dir.LookupCommitted("c", 3) != 0, "large lookup c");
+}
+
+    void
+TestLargeCookieStableAcrossInsert()
+{ // Expects a large directory cookie to stay usable after another insert.
+    DirNode dir(1);
+    Check(dir.InsertPending("a", 1, 1) == 0, "large insert a");
+    Check(dir.InsertPending("b", 2, 2) == 0, "large insert b");
+    Check(dir.InsertPending("d", 4, 3) == 0, "large insert d");
+    dir.CommitThrough(3);
+    Check(dir.IsLarge(), "large state");
+    ReaddirResult res;
+    Check(dir.ReaddirCommitted(3, 0, 1, res) == 0, "large first page");
+    Check(res.entries.size() == 1, "large page size");
+    const ReaddirCookie cookie = res.nextCookie; // taken before the insert
+    const uint64_t gen = dir.GetGeneration(); // must not change on insert
+    Check(dir.InsertPending("c", 3, 4) == 0, "large insert keeps cookie valid");
+    dir.CommitThrough(4);
+    Check(dir.GetGeneration() == gen, "large create does not bump generation");
+    Check(dir.ReaddirCommitted(4, &cookie, 10, res) == 0,
+        "old large NameKey cookie remains valid");
+    Check(! res.entries.empty(), "large resume returns entries");
+    for (size_t i = 0; i < res.entries.size(); i++) {
+        Check(cookie.lastKey < res.entries[i].key,
+            "large resume returns entries after last key");
+    }
+}
+
+    void
+TestInodeTable()
+{ // Expects InodeTable inserts and deletes to become visible at their txn id.
+    InodeTable table;
+    Check(table.Insert(InodeRecord(10, 1, kInodeTypeFile, 5)),
+        "inode insert");
+    Check(! table.Insert(InodeRecord(10, 1, kInodeTypeFile, 6)),
+        "duplicate inode insert rejected");
+    Check(table.FindCommitted(10, 4) == 0, "pending inode invisible");
+    Check(table.FindCommitted(10, 5) != 0, "committed inode visible");
+    Check(table.MarkDeleted(10, 8), "inode mark deleted");
+    Check(table.FindCommitted(10, 7) != 0, "pending inode delete visible");
+    Check(table.FindCommitted(10, 8) == 0, "committed inode delete invisible");
+}
+
+    void
+TestNamespaceStoreBasic()
+{ // Walks one file through create, commit, remove, commit and recreate.
+    Config cfg;
+    cfg.dirLargeThreshold = 2;
+    NamespaceStore store(cfg);
+    LookupResult lookup;
+    Check(store.GetRootFid() == ROOTFID, "namespace root fid");
+    Check(store.GetInodeCount() == 1, "namespace root inode");
+    Check(store.GetDirCount() == 1, "namespace root dir");
+    Check(store.Lookup(ROOTFID, "f", lookup) == -ENOENT,
+        "namespace missing lookup");
+    // A create stays invisible to Lookup() until CommitThrough() covers it.
+    CreateResult create;
+    Check(store.Create(ROOTFID, "f", kInodeTypeFile, &create) == 0,
+        "namespace create file");
+    Check(create.fid > ROOTFID, "namespace create fid assigned");
+    Check(store.GetLastTxn() == create.txnId, "namespace txn assigned");
+    Check(store.Lookup(ROOTFID, "f", lookup) == -ENOENT,
+        "namespace pending create invisible");
+    Check(store.Create(ROOTFID, "f", kInodeTypeFile, 0) == -EEXIST,
+        "namespace duplicate pending create rejected");
+    Check(store.GetLastTxn() == create.txnId,
+        "namespace failed create does not consume txn");
+    // After the commit the file shows up in both Lookup() and Readdir().
+    store.CommitThrough(create.txnId);
+    Check(store.Lookup(ROOTFID, "f", lookup) == 0 &&
+            lookup.fid == create.fid && lookup.type == kInodeTypeFile,
+        "namespace committed create visible");
+    ReaddirResult readdir;
+    Check(store.Readdir(ROOTFID, 0, 10, readdir) == 0 &&
+            readdir.entries.size() == 1 &&
+            readdir.entries[0].childFid == create.fid,
+        "namespace readdir committed create");
+    // A pending remove keeps the name visible and blocks a recreate.
+    TxnId deleteTxn = 0;
+    Check(store.Remove(ROOTFID, "f", &deleteTxn) == 0,
+        "namespace remove file");
+    Check(deleteTxn > create.txnId, "namespace remove txn assigned");
+    Check(store.Lookup(ROOTFID, "f", lookup) == 0,
+        "namespace pending remove still visible");
+    Check(store.Create(ROOTFID, "f", kInodeTypeFile, 0) == -EEXIST,
+        "namespace recreate rejected while remove pending");
+    Check(store.GetLastTxn() == deleteTxn,
+        "namespace failed recreate does not consume txn");
+    store.CommitThrough(deleteTxn);
+    Check(store.Lookup(ROOTFID, "f", lookup) == -ENOENT,
+        "namespace committed remove invisible");
+    Check(store.Create(ROOTFID, "f", kInodeTypeFile, 0) == 0,
+        "namespace recreate after committed remove");
+}
+
+    void
+TestNamespaceStoreDirectory()
+{ // Expects a new directory to accept children only after its create commits.
+    Config cfg;
+    cfg.dirLargeThreshold = 1;
+    NamespaceStore store(cfg);
+    CreateResult dir;
+    Check(store.Create(ROOTFID, "d", kInodeTypeDir, &dir) == 0,
+        "namespace mkdir");
+    Check(store.GetDirCount() == 2, "namespace dir table insert pending dir");
+    Check(store.Create(dir.fid, "before_commit", kInodeTypeFile, 0) == -ENOENT,
+        "namespace pending dir not usable");
+    Check(store.GetLastTxn() == dir.txnId,
+        "namespace failed child create does not consume txn");
+    store.CommitThrough(dir.txnId);
+    // With the mkdir committed, a child can be created and looked up.
+    CreateResult first;
+    Check(store.Create(dir.fid, "a", kInodeTypeFile, &first) == 0,
+        "namespace child create");
+    store.CommitThrough(first.txnId);
+    LookupResult lookup;
+    Check(store.Lookup(dir.fid, "a", lookup) == 0 && lookup.fid == first.fid,
+        "namespace child lookup");
+    // Per its Check label, the second child is expected to promote the dir.
+    CreateResult second;
+    Check(store.Create(dir.fid, "b", kInodeTypeFile, &second) == 0,
+        "namespace child create promotes dir");
+    store.CommitThrough(second.txnId);
+    ReaddirResult readdir;
+    Check(store.Readdir(dir.fid, 0, 10, readdir) == 0 &&
+            readdir.entries.size() == 2,
+        "namespace child readdir after promotion");
+}
+
+    void
+TestNamespaceStorePathAndRmdir()
+{ // Exercises LookupPath() and the type/emptiness checks of RemoveFile/Rmdir.
+    Config cfg;
+    NamespaceStore store(cfg);
+    CreateResult dir;
+    Check(store.Create(ROOTFID, "d", kInodeTypeDir, &dir,
+            10, 20, 0755, 0, 100) == 0,
+        "namespace path mkdir d");
+    store.CommitThrough(dir.txnId);
+
+    CreateResult child;
+    Check(store.Create(dir.fid, "c", kInodeTypeDir, &child,
+            11, 21, 0750, 0, 101) == 0,
+        "namespace path mkdir child");
+    store.CommitThrough(child.txnId);
+
+    CreateResult file;
+    Check(store.Create(child.fid, "f", kInodeTypeFile, &file,
+            12, 22, 0644, 2, 102) == 0,
+        "namespace path create file");
+    store.CommitThrough(file.txnId);
+    // The lookup must return the user, group, mode and replicas given above.
+    LookupResult lookup;
+    Check(store.LookupPath(ROOTFID, "/d/c/f", lookup) == 0 &&
+            lookup.fid == file.fid && lookup.user == 12 &&
+            lookup.group == 22 && lookup.mode == 0644 &&
+            lookup.numReplicas == 2,
+        "namespace lookup path attrs");
+    Check(store.LookupPath(ROOTFID, "/d/c/f/x", lookup) == -ENOTDIR,
+        "namespace lookup below file rejected");
+    Check(store.RemoveFile(dir.fid, "c", 0) == -EISDIR,
+        "namespace remove dir as file rejected");
+    Check(store.Rmdir(child.fid, "f", 0) == -ENOTDIR,
+        "namespace rmdir file rejected");
+    Check(store.Rmdir(dir.fid, "c", 0) == -ENOTEMPTY,
+        "namespace rmdir non-empty rejected");
+    // Once the only child file is removed, the Rmdir() is expected to succeed.
+    TxnId txnId = 0;
+    Check(store.RemoveFile(child.fid, "f", &txnId) == 0,
+        "namespace remove child file");
+    store.CommitThrough(txnId);
+    Check(store.Rmdir(dir.fid, "c", &txnId) == 0,
+        "namespace rmdir empty child");
+    store.CommitThrough(txnId);
+    Check(store.LookupPath(ROOTFID, "/d/c", lookup) == -ENOENT,
+        "namespace rmdir committed invisible");
+}
+
+
+    void
+TestNamespaceStoreRename()
+{ // Covers same-dir, cross-dir and overwrite renames plus the rejected cases.
+    Config cfg;
+    cfg.dirLargeThreshold = 2;
+    NamespaceStore store(cfg);
+    // Two committed directories under the root: d and e.
+    CreateResult d;
+    Check(store.Create(ROOTFID, "d", kInodeTypeDir, &d) == 0,
+        "rename mkdir d");
+    store.CommitThrough(d.txnId);
+    CreateResult e;
+    Check(store.Create(ROOTFID, "e", kInodeTypeDir, &e) == 0,
+        "rename mkdir e");
+    store.CommitThrough(e.txnId);
+
+    CreateResult file;
+    Check(store.Create(d.fid, "f", kInodeTypeFile, &file) == 0,
+        "rename create source file");
+    store.CommitThrough(file.txnId);
+    // Rename inside d: f -> g; srcFid must report the renamed file's fid.
+    TxnId txnId = 0;
+    fid_t srcFid = -1;
+    Check(store.Rename(d.fid, "f", "g", false, &txnId, &srcFid) == 0 &&
+            srcFid == file.fid,
+        "rename same dir file");
+    store.CommitThrough(txnId);
+    LookupResult lookup;
+    Check(store.Lookup(d.fid, "f", lookup) == -ENOENT,
+        "rename source removed");
+    Check(store.Lookup(d.fid, "g", lookup) == 0 && lookup.fid == file.fid,
+        "rename target visible");
+    // An absolute target path moves the entry from d into e.
+    Check(store.Rename(d.fid, "g", "/e/h", false, &txnId, &srcFid) == 0,
+        "rename cross dir absolute target");
+    store.CommitThrough(txnId);
+    Check(store.Lookup(d.fid, "g", lookup) == -ENOENT,
+        "cross dir source removed");
+    Check(store.Lookup(e.fid, "h", lookup) == 0 && lookup.fid == file.fid,
+        "cross dir target visible");
+    // Replacing an existing file only succeeds with the overwrite argument set.
+    CreateResult other;
+    Check(store.Create(e.fid, "z", kInodeTypeFile, &other) == 0,
+        "rename create overwrite target");
+    store.CommitThrough(other.txnId);
+    Check(store.Rename(e.fid, "h", "z", false, 0, 0) == -EEXIST,
+        "rename overwrite disabled");
+    Check(store.Rename(e.fid, "h", "z", true, &txnId, &srcFid) == 0,
+        "rename overwrite file");
+    store.CommitThrough(txnId);
+    Check(store.Lookup(e.fid, "h", lookup) == -ENOENT,
+        "overwrite source removed");
+    Check(store.Lookup(e.fid, "z", lookup) == 0 && lookup.fid == file.fid,
+        "overwrite target replaced");
+    // A file must not replace a directory, even with overwrite requested.
+    CreateResult x;
+    Check(store.Create(ROOTFID, "x", kInodeTypeFile, &x) == 0,
+        "rename create type mismatch file");
+    store.CommitThrough(x.txnId);
+    CreateResult y;
+    Check(store.Create(ROOTFID, "y", kInodeTypeDir, &y) == 0,
+        "rename create type mismatch dir");
+    store.CommitThrough(y.txnId);
+    Check(store.Rename(ROOTFID, "x", "y", true, 0, 0) == -EISDIR,
+        "rename file over dir rejected");
+    // A directory must not be moved underneath its own descendant.
+    CreateResult a;
+    Check(store.Create(ROOTFID, "a", kInodeTypeDir, &a) == 0,
+        "rename create ancestor dir");
+    store.CommitThrough(a.txnId);
+    CreateResult b;
+    Check(store.Create(a.fid, "b", kInodeTypeDir, &b) == 0,
+        "rename create descendant dir");
+    store.CommitThrough(b.txnId);
+    Check(store.Rename(ROOTFID, "a", "/a/b/c", false, 0, 0) == -EINVAL,
+        "rename dir into descendant rejected");
+    // A non-empty directory must not be overwritten by a rename.
+    CreateResult p;
+    Check(store.Create(ROOTFID, "p", kInodeTypeDir, &p) == 0,
+        "rename create source dir");
+    store.CommitThrough(p.txnId);
+    CreateResult q;
+    Check(store.Create(ROOTFID, "q", kInodeTypeDir, &q) == 0,
+        "rename create non-empty target dir");
+    store.CommitThrough(q.txnId);
+    CreateResult qChild;
+    Check(store.Create(q.fid, "child", kInodeTypeFile, &qChild) == 0,
+        "rename create target child");
+    store.CommitThrough(qChild.txnId);
+    Check(store.Rename(ROOTFID, "p", "q", true, 0, 0) == -ENOTEMPTY,
+        "rename over non-empty dir rejected");
+}
+
+
+    void
+TestNamespaceStoreCheckpoint()
+{ // Expects SaveCheckpoint()/LoadCheckpoint() to round-trip committed state.
+    Config cfg;
+    cfg.dirLargeThreshold = 1;
+    NamespaceStore store(cfg);
+
+    CreateResult d;
+    Check(store.Create(ROOTFID, "d", kInodeTypeDir, &d,
+            10, 20, 0755, 0, 100) == 0,
+        "checkpoint mkdir d");
+    store.CommitThrough(d.txnId);
+    CreateResult e;
+    Check(store.Create(ROOTFID, "e", kInodeTypeDir, &e,
+            11, 21, 0750, 0, 101) == 0,
+        "checkpoint mkdir e");
+    store.CommitThrough(e.txnId);
+    // A file that is created and then removed must not reappear on restore.
+    CreateResult removed;
+    Check(store.Create(d.fid, "removed", kInodeTypeFile, &removed) == 0,
+        "checkpoint create removed file");
+    store.CommitThrough(removed.txnId);
+    TxnId txnId = 0;
+    Check(store.RemoveFile(d.fid, "removed", &txnId) == 0,
+        "checkpoint remove file");
+    store.CommitThrough(txnId);
+    // A file renamed from /d/f to /e/g must be restored at its new path.
+    CreateResult file;
+    Check(store.Create(d.fid, "f", kInodeTypeFile, &file,
+            12, 22, 0644, 3, 102) == 0,
+        "checkpoint create file");
+    store.CommitThrough(file.txnId);
+    Check(store.Rename(d.fid, "f", "/e/g", false, &txnId, 0) == 0,
+        "checkpoint rename file");
+    store.CommitThrough(txnId);
+
+    LookupResult originalMoved;
+    Check(store.LookupPath(ROOTFID, "/e/g", originalMoved) == 0,
+        "checkpoint original moved lookup");
+    // Serialize into an in-memory stream, then load it into a fresh store.
+    std::stringstream image;
+    Check(store.SaveCheckpoint(image) == 0,
+        "checkpoint save");
+
+    NamespaceStore restored(cfg);
+    Check(restored.LoadCheckpoint(image) == 0,
+        "checkpoint load");
+    Check(restored.GetCommittedTxn() == store.GetCommittedTxn(),
+        "checkpoint committed txn restored");
+    Check(restored.GetLastTxn() == store.GetLastTxn(),
+        "checkpoint last txn restored");
+
+    LookupResult lookup;
+    Check(restored.LookupPath(ROOTFID, "/e/g", lookup) == 0 &&
+            lookup.fid == file.fid && lookup.user == 12 &&
+            lookup.group == 22 && lookup.mode == 0644 &&
+            lookup.numReplicas == 3 &&
+            lookup.parentGeneration == originalMoved.parentGeneration,
+        "checkpoint restored moved file attrs");
+    Check(restored.LookupPath(ROOTFID, "/d/f", lookup) == -ENOENT,
+        "checkpoint old rename source absent");
+    Check(restored.LookupPath(ROOTFID, "/d/removed", lookup) == -ENOENT,
+        "checkpoint removed file absent");
+
+    ReaddirResult readdir;
+    Check(restored.Readdir(e.fid, 0, 10, readdir) == 0 &&
+            readdir.entries.size() == 1 &&
+            readdir.entries[0].childFid == file.fid,
+        "checkpoint restored dir entries");
+    // The restored store must hand out the next txn id and a larger fid.
+    const TxnId lastTxn = restored.GetLastTxn();
+    CreateResult after;
+    Check(restored.Create(ROOTFID, "after", kInodeTypeFile, &after) == 0,
+        "checkpoint create after restore");
+    Check(after.txnId == lastTxn + 1 && after.fid > file.fid,
+        "checkpoint seeds continue after restore");
+}
+
+    void
+TestNamespaceStoreEditLog()
+{ // Expects checkpoint load plus edit log replay to reproduce the live store.
+    Config cfg;
+    cfg.dirLargeThreshold = 1;
+    NamespaceStore store(cfg);
+
+    CreateResult d;
+    Check(store.Create(ROOTFID, "d", kInodeTypeDir, &d,
+            10, 20, 0755, 0, 100) == 0,
+        "edit log mkdir d");
+    store.CommitThrough(d.txnId);
+    CreateResult e;
+    Check(store.Create(ROOTFID, "e", kInodeTypeDir, &e,
+            11, 21, 0750, 0, 101) == 0,
+        "edit log mkdir e");
+    store.CommitThrough(e.txnId);
+    // Checkpoint taken here; each later mutation is also written to logs.
+    std::stringstream checkpoint;
+    Check(store.SaveCheckpoint(checkpoint) == 0,
+        "edit log checkpoint save");
+
+    std::stringstream logs;
+    CreateResult tmp;
+    Check(store.Create(d.fid, "tmp", kInodeTypeFile, &tmp,
+            30, 40, 0600, 1, 200) == 0,
+        "edit log create tmp");
+    EditLogRecord tmpCreate; // mirrors the Create() call directly above
+    tmpCreate.type = EditLogRecord::kCreate;
+    tmpCreate.txnId = tmp.txnId;
+    tmpCreate.parentFid = d.fid;
+    tmpCreate.name = "tmp";
+    tmpCreate.fid = tmp.fid;
+    tmpCreate.inodeType = kInodeTypeFile;
+    tmpCreate.user = 30;
+    tmpCreate.group = 40;
+    tmpCreate.mode = 0600;
+    tmpCreate.numReplicas = 1;
+    tmpCreate.mtime = 200;
+    Check(WriteEditLog(logs, tmpCreate) == 0,
+        "edit log write tmp create");
+    store.CommitThrough(tmp.txnId);
+
+    TxnId removeTxn = 0;
+    Check(store.RemoveFile(d.fid, "tmp", &removeTxn) == 0,
+        "edit log remove tmp");
+    EditLogRecord tmpRemove; // mirrors the RemoveFile() call directly above
+    tmpRemove.type = EditLogRecord::kRemove;
+    tmpRemove.txnId = removeTxn;
+    tmpRemove.parentFid = d.fid;
+    tmpRemove.name = "tmp";
+    tmpRemove.inodeType = kInodeTypeFile;
+    Check(WriteEditLog(logs, tmpRemove) == 0,
+        "edit log write tmp remove");
+    store.CommitThrough(removeTxn);
+
+    CreateResult file;
+    Check(store.Create(d.fid, "f", kInodeTypeFile, &file,
+            31, 41, 0644, 3, 201) == 0,
+        "edit log create file");
+    EditLogRecord fileCreate; // mirrors the Create() call directly above
+    fileCreate.type = EditLogRecord::kCreate;
+    fileCreate.txnId = file.txnId;
+    fileCreate.parentFid = d.fid;
+    fileCreate.name = "f";
+    fileCreate.fid = file.fid;
+    fileCreate.inodeType = kInodeTypeFile;
+    fileCreate.user = 31;
+    fileCreate.group = 41;
+    fileCreate.mode = 0644;
+    fileCreate.numReplicas = 3;
+    fileCreate.mtime = 201;
+    Check(WriteEditLog(logs, fileCreate) == 0,
+        "edit log write file create");
+    store.CommitThrough(file.txnId);
+
+    TxnId renameTxn = 0;
+    fid_t srcFid = -1;
+    Check(store.Rename(d.fid, "f", "/e/g", false,
+            &renameTxn, &srcFid) == 0 && srcFid == file.fid,
+        "edit log rename file");
+    EditLogRecord rename; // mirrors the Rename() call directly above
+    rename.type = EditLogRecord::kRename;
+    rename.txnId = renameTxn;
+    rename.parentFid = d.fid;
+    rename.name = "f";
+    rename.fid = srcFid;
+    rename.newPath = "/e/g";
+    rename.overwriteFlag = false;
+    Check(WriteEditLog(logs, rename) == 0,
+        "edit log write rename");
+    store.CommitThrough(renameTxn);
+    // Rebuild a second store from the checkpoint and the recorded log.
+    NamespaceStore restored(cfg);
+    Check(restored.LoadCheckpoint(checkpoint) == 0,
+        "edit log checkpoint load");
+    Check(restored.ApplyEditLog(logs) == 0,
+        "edit log replay stream");
+
+    LookupResult lookup;
+    Check(restored.LookupPath(ROOTFID, "/e/g", lookup) == 0 &&
+            lookup.fid == file.fid && lookup.user == 31 &&
+            lookup.group == 41 && lookup.mode == 0644 &&
+            lookup.numReplicas == 3,
+        "edit log replay moved file attrs");
+    Check(restored.LookupPath(ROOTFID, "/d/f", lookup) == -ENOENT,
+        "edit log replay old rename source absent");
+    Check(restored.LookupPath(ROOTFID, "/d/tmp", lookup) == -ENOENT,
+        "edit log replay removed file absent");
+    Check(restored.GetCommittedTxn() == store.GetCommittedTxn(),
+        "edit log committed txn restored");
+    Check(restored.GetLastTxn() == store.GetLastTxn(),
+        "edit log last txn restored");
+
+    CreateResult after;
+    Check(restored.Create(ROOTFID, "after_log", kInodeTypeFile, &after) == 0,
+        "edit log create after replay");
+    Check(after.txnId == store.GetLastTxn() + 1 && after.fid > file.fid,
+        "edit log seeds continue after replay");
+    // Per the Check label, a record with a zero txn id must fail to parse.
+    EditLogRecord parsed;
+    Check(ReadEditLog("namespacev2_edit 1 create 0 3 4 0 1 1 0 1 1 61",
+            parsed) == -EINVAL,
+        "edit log rejects zero txn");
+}
+
+    void
+TestNamespaceStoreEditLogFailedCreateNoop()
+{ // Expects a failed replayed create to still consume and commit its txn id.
+    Config cfg;
+    NamespaceStore store(cfg);
+
+    fid_t fid = -1;
+    TxnId txnId = 0;
+    store.ReserveCreateIds(fid, txnId);
+    EditLogRecord first;
+    first.type = EditLogRecord::kCreate;
+    first.txnId = txnId;
+    first.parentFid = ROOTFID;
+    first.name = "dup";
+    first.fid = fid;
+    first.inodeType = kInodeTypeFile;
+    first.mode = 0644;
+    Check(store.ApplyEditLog(first) == 0,
+        "edit log first create succeeds");
+    // Replay the same name again under freshly reserved ids: must fail.
+    fid_t failedFid = -1;
+    TxnId failedTxnId = 0;
+    store.ReserveCreateIds(failedFid, failedTxnId);
+    EditLogRecord duplicate(first);
+    duplicate.txnId = failedTxnId;
+    duplicate.fid = failedFid;
+    Check(store.ApplyEditLog(duplicate) == -EEXIST,
+        "edit log duplicate create fails");
+    Check(store.GetCommittedTxn() == failedTxnId,
+        "edit log failed create commits no-op txn");
+    Check(store.GetLastTxn() == failedTxnId,
+        "edit log failed create advances txn seed");
+
+    LookupResult lookup;
+    Check(store.Lookup(ROOTFID, "dup", lookup) == 0 && lookup.fid == fid,
+        "edit log failed create keeps original entry");
+    // A later create must continue with ids after those of the failed one.
+    CreateResult after;
+    Check(store.Create(ROOTFID, "after_failed", kInodeTypeFile, &after) == 0,
+        "edit log create after failed no-op");
+    Check(after.txnId == failedTxnId + 1 && after.fid > failedFid,
+        "edit log create continues after failed no-op");
+}
+
+
+    void
+TestNamespaceStoreEditLogBatchCreateCommit()
+{ // Expects a batch child to see its uncommitted parent; one commit for both.
+    Config cfg;
+    NamespaceStore store(cfg);
+
+    fid_t parentFid = -1;
+    TxnId parentTxn = 0;
+    store.ReserveCreateIds(parentFid, parentTxn);
+    EditLogRecord parent;
+    parent.type = EditLogRecord::kCreate;
+    parent.txnId = parentTxn;
+    parent.parentFid = ROOTFID;
+    parent.name = "batch_parent";
+    parent.fid = parentFid;
+    parent.inodeType = kInodeTypeDir;
+    parent.mode = 0755;
+    Check(store.ApplyCreate(parent.parentFid, parent.name, parent.inodeType,
+            parent.fid, parent.txnId, kKfsUserRoot, kKfsGroupRoot,
+            parent.mode, 0, 0, false, false) == 0,
+        "batch parent create succeeds");
+    Check(store.GetCommittedTxn() < parentTxn,
+        "edit log batch parent not globally committed yet");
+    // The child is created inside the directory that is not committed yet.
+    fid_t childFid = -1;
+    TxnId childTxn = 0;
+    store.ReserveCreateIds(childFid, childTxn);
+    EditLogRecord child;
+    child.type = EditLogRecord::kCreate;
+    child.txnId = childTxn;
+    child.parentFid = parentFid;
+    child.name = "child";
+    child.fid = childFid;
+    child.inodeType = kInodeTypeDir;
+    child.mode = 0755;
+    Check(store.ApplyCreate(child.parentFid, child.name, child.inodeType,
+            child.fid, child.txnId, kKfsUserRoot, kKfsGroupRoot,
+            child.mode, 0, 0, false, false) == 0,
+        "batch child sees prior parent create");
+    // A single range commit is expected to publish both creates.
+    store.CommitThroughRange(parentTxn, childTxn);
+    LookupResult lookup;
+    Check(store.Lookup(parentFid, "child", lookup) == 0 &&
+            lookup.fid == childFid,
+        "edit log batch committed child lookup");
+    Check(store.GetCommittedTxn() == childTxn,
+        "edit log batch commit through last txn");
+}
+
+
+    void
+TestNamespaceStoreConcurrentShardLocks()
+{ // Runs create, rename and remove from 8 threads, one parent dir per thread.
+    Config cfg;
+    cfg.dirLargeThreshold = 8;
+    NamespaceStore store(cfg);
+
+    const int kThreads = 8;
+    const int kFilesPerThread = 100;
+    vector<fid_t> parents;
+    parents.reserve(kThreads);
+    for (int t = 0; t < kThreads; ++t) {
+        CreateResult dir;
+        Check(store.Create(ROOTFID, MakeName("cd", t),
+                kInodeTypeDir, &dir) == 0,
+            "concurrent mkdir parent");
+        store.CommitThrough(dir.txnId);
+        parents.push_back(dir.fid);
+    }
+    // Workers only bump the atomic counter; Check() is called after join().
+    std::atomic<int> failures(0);
+    vector<std::thread> threads;
+    for (int t = 0; t < kThreads; ++t) {
+        threads.push_back(std::thread([&store, &parents, &failures, t]() {
+            for (int i = 0; i < kFilesPerThread; ++i) {
+                CreateResult create;
+                const int status = store.Create(parents[t],
+                    MakeName("f", t, i), kInodeTypeFile, &create);
+                if (status != 0) {
+                    ++failures;
+                    continue;
+                }
+                store.CommitThrough(create.txnId);
+            }
+        }));
+    }
+    for (size_t i = 0; i < threads.size(); ++i) {
+        threads[i].join();
+    }
+    Check(failures.load() == 0, "concurrent create threads");
+
+    LookupResult lookup;
+    for (int t = 0; t < kThreads; ++t) {
+        for (int i = 0; i < kFilesPerThread; ++i) {
+            Check(store.Lookup(parents[t], MakeName("f", t, i),
+                    lookup) == 0,
+                "concurrent create lookup");
+        }
+    }
+    // Second phase: rename every f<t>_<i> to g<t>_<i> concurrently.
+    failures = 0;
+    threads.clear();
+    for (int t = 0; t < kThreads; ++t) {
+        threads.push_back(std::thread([&store, &parents, &failures, t]() {
+            for (int i = 0; i < kFilesPerThread; ++i) {
+                TxnId txnId = 0;
+                const int status = store.Rename(parents[t],
+                    MakeName("f", t, i), MakeName("g", t, i),
+                    false, &txnId, 0);
+                if (status != 0) {
+                    ++failures;
+                    continue;
+                }
+                store.CommitThrough(txnId);
+            }
+        }));
+    }
+    for (size_t i = 0; i < threads.size(); ++i) {
+        threads[i].join();
+    }
+    Check(failures.load() == 0, "concurrent rename threads");
+    for (int t = 0; t < kThreads; ++t) {
+        for (int i = 0; i < kFilesPerThread; ++i) {
+            Check(store.Lookup(parents[t], MakeName("f", t, i),
+                    lookup) == -ENOENT,
+                "concurrent rename old missing");
+            Check(store.Lookup(parents[t], MakeName("g", t, i),
+                    lookup) == 0,
+                "concurrent rename new visible");
+        }
+    }
+    // Third phase: remove every renamed file concurrently.
+    failures = 0;
+    threads.clear();
+    for (int t = 0; t < kThreads; ++t) {
+        threads.push_back(std::thread([&store, &parents, &failures, t]() {
+            for (int i = 0; i < kFilesPerThread; ++i) {
+                TxnId txnId = 0;
+                const int status = store.RemoveFile(parents[t],
+                    MakeName("g", t, i), &txnId);
+                if (status != 0) {
+                    ++failures;
+                    continue;
+                }
+                store.CommitThrough(txnId);
+            }
+        }));
+    }
+    for (size_t i = 0; i < threads.size(); ++i) {
+        threads[i].join();
+    }
+    Check(failures.load() == 0, "concurrent remove threads");
+    for (int t = 0; t < kThreads; ++t) {
+        for (int i = 0; i < kFilesPerThread; ++i) {
+            Check(store.Lookup(parents[t], MakeName("g", t, i),
+                    lookup) == -ENOENT,
+                "concurrent remove missing");
+        }
+    }
+}
+
+
+    void
+TestResourceLockOrdering()
+{ // Expects sorted keys in the order snapshot, dir (by shard), inode, edit log.
+    vector<ResourceLockKey> locks;
+    locks.push_back(ResourceLockKey(ResourceLockKey::kInode, 10));
+    locks.push_back(ResourceLockKey(ResourceLockKey::kDir, 3, 20));
+    locks.push_back(ResourceLockKey(ResourceLockKey::kSnapshot, 0));
+    locks.push_back(ResourceLockKey(ResourceLockKey::kDir, 2, 30));
+    locks.push_back(ResourceLockKey(ResourceLockKey::kEditLog, 0));
+    std::sort(locks.begin(), locks.end()); // uses ResourceLockKey's operator<
+    Check(locks[0].resourceClass == ResourceLockKey::kSnapshot,
+        "snapshot lock first");
+    Check(locks[1].resourceClass == ResourceLockKey::kDir &&
+            locks[1].major == 2,
+        "dir locks sorted by shard");
+    Check(locks[2].resourceClass == ResourceLockKey::kDir &&
+            locks[2].major == 3,
+        "dir locks sorted by shard second");
+    Check(locks[3].resourceClass == ResourceLockKey::kInode,
+        "inode after dir");
+    Check(locks[4].resourceClass == ResourceLockKey::kEditLog,
+        "edit log last");
+}
+
+} // namespace
+
+    int
+main(
+    int    /* argc */,
+    char** /* argv */)
+{ // Runs every test in order; the exit status is 1 if any Check() failed.
+    TestConfig();
+    TestNameKey();
+    TestPendingCommittedVisibility();
+    TestSmallCookieInvalidation();
+    TestPromotion();
+    TestLargeCookieStableAcrossInsert();
+    TestInodeTable();
+    TestNamespaceStoreBasic();
+    TestNamespaceStoreDirectory();
+    TestNamespaceStorePathAndRmdir();
+    TestNamespaceStoreRename();
+    TestNamespaceStoreCheckpoint();
+    TestNamespaceStoreEditLog();
+    TestNamespaceStoreEditLogFailedCreateNoop();
+    TestNamespaceStoreEditLogBatchCreateCommit();
+    TestNamespaceStoreConcurrentShardLocks();
+    TestResourceLockOrdering();
+    if (gErrorCount != 0) { // gErrorCount is incremented by each failed Check()
+        cout << gErrorCount << " NamespaceV2 tests failed\n";
+        return 1;
+    }
+    cout << "NamespaceV2 tests passed\n";
+    return 0;
+}
diff --git a/src/cc/meta/namespacev2walreplaytest_main.cc b/src/cc/meta/namespacev2walreplaytest_main.cc
new file mode 100644
index 000000000..b15177166
--- /dev/null
+++ b/src/cc/meta/namespacev2walreplaytest_main.cc
@@ -0,0 +1,169 @@
+//---------------------------------------------------------- -*- Mode: C++ -*-
+// Minimal integration test:
+//   write nv2batch WAL line -> Replay::playLine() -> validate NamespaceV2 state.
+//
+// This intentionally bypasses LogWriter / disk IO, and tests the log record
+// format + replay parser + NamespaceV2 apply/commit chain.
+
+#include "Replay.h"
+#include "NamespaceV2.h"
+
+#include "kfsio/Base64.h"
+
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+#include <iostream>
+
+using std::string;
+using std::vector;
+using std::cout;
+
+namespace
+{
+
+// Appends the low `bytes` bytes of v to out, least significant byte first.
+// v is shifted one byte per step, so a width above 8 pads with zero bytes
+// instead of shifting by 64 or more bits, which is undefined behavior.
+static void
+AppendLe(string& out, uint64_t v, size_t bytes)
+{
+    for (size_t i = 0; i < bytes; i++, v >>= 8) {
+        out.push_back((char)v);
+    }
+}
+
+// Fixed-width appenders. Each forwards to AppendLe() with the byte width
+// of its parameter type (1, 2, 4 or 8), so the value is written
+// little-endian.
+static void AppendU8(string& out, uint8_t v)   { AppendLe(out, v, sizeof(v)); }
+static void AppendU16(string& out, uint16_t v) { AppendLe(out, v, sizeof(v)); }
+static void AppendU32(string& out, uint32_t v) { AppendLe(out, v, sizeof(v)); }
+static void AppendU64(string& out, uint64_t v) { AppendLe(out, v, sizeof(v)); }
+// Signed values are converted to the unsigned type of the same width and
+// written through the matching unsigned appender; the conversion is
+// modular, so negative values keep their two's complement bit pattern.
+static void AppendI16(string& out, int16_t v)  { AppendU16(out, (uint16_t)v); }
+static void AppendI64(string& out, int64_t v)  { AppendU64(out, (uint64_t)v); }
+
+// Appends one nv2batch op record to payload, in this field order:
+// u8 opType, i64 dir, i64 fid, u64 txn, u32 user, u32 group, u16 mode,
+// i16 repl, i64 mtime, u16 name length, name bytes (integers little-endian).
+// Field meanings are taken from the names used in this test only;
+// confirm against the replay parser before changing the layout.
+static void
+AppendNv2Op(
+    string&        payload,
+    uint8_t        opType,
+    KFS::fid_t     dir,
+    KFS::fid_t     fid,
+    uint64_t       txn,
+    uint16_t       mode,
+    int16_t        repl,
+    const string&  name)
+{
+    // User, group and mtime are the same for every op in this test.
+    const uint32_t kUser  = 1;
+    const uint32_t kGroup = 1;
+    const int64_t  kMtime = 123456789;
+
+    AppendU8(payload, opType);
+    AppendI64(payload, (int64_t)dir);
+    AppendI64(payload, (int64_t)fid);
+    AppendU64(payload, txn);
+    AppendU32(payload, kUser);
+    AppendU32(payload, kGroup);
+    AppendU16(payload, mode);
+    AppendI16(payload, repl);
+    AppendI64(payload, kMtime);
+    AppendU16(payload, (uint16_t)name.size());
+    payload += name;
+}
+
+// Builds the raw (not yet base64 encoded) payload of a two-op batch:
+// mkdir /d followed by create /d/f.
+static string
+EncodeNv2BatchPayload()
+{
+    const KFS::fid_t kDirFid  = 1001;
+    const KFS::fid_t kFileFid = 1002;
+
+    string batch;
+    batch.reserve(256);
+    // Transaction ids are contiguous (1, 2); main() seeds the committed
+    // transaction id to 0 before replay.
+    // opType 2: mkdir "d" under the root directory, mode 0755, repl 0.
+    AppendNv2Op(batch, 2, KFS::ROOTFID, kDirFid, 1, 0755, 0, "d");
+    // opType 1: create file "f" under "d", mode 0644, repl 1.
+    AppendNv2Op(batch, 1, kDirFid, kFileFid, 2, 0644, 1, "f");
+    return batch;
+}
+
+// Base64 text for bytes; empty when there is no output buffer to encode
+// into (&buf[0] of an empty vector is undefined) or Encode() returns <= 0.
+static string
+Base64Encode(const string& bytes)
+{
+    const int inLen = (int)bytes.size();
+    vector<char> buf((size_t)KFS::Base64::GetEncodedMaxBufSize(inLen));
+    const int len = buf.empty() ? 0 :
+        KFS::Base64::Encode(bytes.data(), inLen, &buf[0], true);
+    return len <= 0 ? string() : string(&buf[0], len);
+}
+
+static int  // always 1, usable directly as main()'s exit status
+Fail(const char* msg)
+{
+    const int kFailureStatus = 1;
+    cout << "FAILED: " << msg << "\n";
+    return kFailureStatus;
+}
+
+} // anonymous
+
+int
+main()
+{
+    namespace NV2 = KFS::NamespaceV2;
+    // Get the store and imitate a checkpoint whose committed txn is 0, so
+    // that replay can commit a contiguous txn range starting at 1.
+    NV2::NamespaceStore& nsStore = NV2::GetStore();
+    nsStore.CommitThroughRange(0, 0);
+
+    const string encoded = Base64Encode(EncodeNv2BatchPayload());
+    if (encoded.empty()) {
+        return Fail("base64 encode");
+    }
+    // The nv2batch record line, followed by one placeholder line to mimic
+    // the WAL sequence count.
+    const string batchLine = "nv2batch/c/2/b/" + encoded + "\n";
+    const string closeLine("nv2batchc\n");
+    // Replay each line under its own block sequence number (1, then 2).
+    const int batchStatus = KFS::replayer.playLine(
+        batchLine.data(), (int)batchLine.size(), 1);
+    if (batchStatus != 0) {
+        cout << "nv2batch line: " << batchLine;
+        return Fail("replay nv2batch");
+    }
+    const int closeStatus = KFS::replayer.playLine(
+        closeLine.data(), (int)closeLine.size(), 2);
+    if (closeStatus != 0) {
+        return Fail("replay nv2batchc");
+    }
+
+    // The replayed entries must now resolve: "d" under the root as a
+    // directory, and "f" under "d" as a file.
+    NV2::LookupResult dirRes;
+    if (NV2::GetStore().Lookup(KFS::ROOTFID, "d", dirRes) != 0 ||
+            dirRes.type != NV2::kInodeTypeDir) {
+        return Fail("lookup dir d");
+    }
+    NV2::LookupResult fileRes;
+    if (NV2::GetStore().Lookup(dirRes.fid, "f", fileRes) != 0 ||
+            fileRes.type != NV2::kInodeTypeFile) {
+        return Fail("lookup file f");
+    }
+    cout << "NamespaceV2 WAL replay integration test passed\n";
+    return 0;
+}
diff --git a/src/cc/qcdio/QCThread.cc b/src/cc/qcdio/QCThread.cc
index bc6c3e6d5..61b340ab8 100644
--- a/src/cc/qcdio/QCThread.cc
+++ b/src/cc/qcdio/QCThread.cc
@@ -125,12 +125,16 @@ QCThread::~QCThread()
     QCThread::Join();
 }
 
-const int kMinThreadStackSize =
-#ifdef PTHREAD_STACK_MIN
-    PTHREAD_STACK_MIN + (4 << 10);
-#else
-    (8 << 10);
-#endif
+// Minimum thread stack size: 256KB, raised to PTHREAD_STACK_MIN plus 4KB
+// of headroom on platforms where that is larger, because
+// pthread_attr_setstacksize() rejects sizes below PTHREAD_STACK_MIN.
+const int kDefaultMinThreadStackSize = 256 << 10;
+const int kMinThreadStackSize =
+#ifdef PTHREAD_STACK_MIN
+    (PTHREAD_STACK_MIN + (4 << 10)) > kDefaultMinThreadStackSize ?
+    (PTHREAD_STACK_MIN + (4 << 10)) : kDefaultMinThreadStackSize;
+#else
+    kDefaultMinThreadStackSize;
+#endif
 
     int
 QCThread::TryToStart(
diff --git a/src/cc/tools/qfsput_main.cc b/src/cc/tools/qfsput_main.cc
index 30a189be2..129c51cea 100644
--- a/src/cc/tools/qfsput_main.cc
+++ b/src/cc/tools/qfsput_main.cc
@@ -58,7 +58,7 @@ main(int argc, char **argv)
     const char* config         = 0;
     ssize_t     numBytes;
 
-    while ((optchar = getopt(argc, argv, "hs:p:f:v")) != -1) {
+    while ((optchar = getopt(argc, argv, "hs:p:f:c:v")) != -1) {
         switch (optchar) {
             case 'f':
                 kfspathname = optarg;
@@ -105,9 +105,7 @@ main(int argc, char **argv)
     }
 
     numBytes = doPut(kfspathname);
-    if (numBytes <= 0) {
-        cout << "Wrote " << numBytes << " to " << kfspathname << "\n";
-    }
+    cout << "Wrote " << numBytes << " to " << kfspathname << "\n";
     delete gKfsClient;
 
     return (numBytes < 0 ? 1 : 0);
@@ -124,12 +122,13 @@ doPut(const string &filename)
         cout << "Create failed: " << ErrorCodeToStr(fd) << "\n";
         return fd;
     }
-    while(cin.read(dataBuf, sizeof(dataBuf))) {
+    while(cin.read(dataBuf, sizeof(dataBuf)) || cin.gcount() > 0) {
         const size_t cnt = cin.gcount();
         const int    res = gKfsClient->Write(fd, dataBuf, cnt);
         if (res != (int)cnt) {
             cout << "Write failed...expect to write: " << cnt <<
                 " but only wrote: " << res << "\n";
+            gKfsClient->Close(fd);
             return -1;
         }
         bytesWritten += res;