mariadb-corporation · mariadb-ElijahOduba · May 14, 2026
diff --git a/.github/workflows/generate-help-tables.yml b/.github/workflows/generate-help-tables.yml
@@ -44,10 +44,11 @@ jobs:
 
       - name: Check server compatibility
         run: |
-          docker run --rm -v $PWD/help-tables:/docker-entrypoint-initdb.d mariadb:11.4 \
-            bash -c '(echo -e "create database mysql;\nuse mysql;"; \
-                      cat /usr/share/mariadb/mariadb_system_tables.sql /docker-entrypoint-initdb.d/fill_help_tables.sql) \
-                      | /usr/sbin/mariadbd --bootstrap'
+          docker run --rm -v "$PWD/help-tables:/help-tables" mariadb:11.4 sh -c '
+            { printf "create database mysql;\nuse mysql;\n";
+              cat /usr/share/mariadb/mariadb_system_tables.sql /help-tables/fill_help_tables.sql;
+            } | mariadbd --bootstrap
+          '
 
       - name: Upload artifact
         uses: actions/upload-artifact@v4

diff --git a/help-tables/markdown_extractor.py b/help-tables/markdown_extractor.py
@@ -325,12 +325,8 @@ def strip_markdown(text: str) -> str:
     return text.strip()
 
 
-def truncate_to_bytes(text: str, max_bytes: int = 60000) -> str:
-    """Truncate a string so its UTF-8 encoding fits within max_bytes.
-
-    MariaDB's help_topic.description column is TEXT (max 65,535 bytes).
-    We use 60,000 as a conservative limit to leave headroom.
-    """
+def truncate_to_bytes(text: str, max_bytes: int = 15000) -> str:
+    """Truncate to fit MAX_BOOTSTRAP_QUERY_SIZE (20000) after escape + boilerplate."""
     encoded = text.encode('utf8')
     if len(encoded) <= max_bytes:
         return text
@@ -370,12 +366,6 @@ def build_output(name, syntax: str, desc: str, example: list, path: str):
         parts.append(f"Examples\n--------\n\n{example_str}")
     desc_str = "\n\n".join(parts) if parts else ""
     desc_str = strip_markdown(desc_str)
-    # `path` comes from get_files() which walks REPO_ROOT/server/reference as
-    # an absolute path, so each entry is an absolute filesystem path (in CI:
-    # /home/runner/work/mariadb-docs/mariadb-docs/...). Make it relative to
-    # REPO_ROOT before tacking it onto the docs URL — otherwise we ship URLs
-    # like https://mariadb.com/docs//home/runner/... (broken) instead of
-    # https://mariadb.com/docs/server/reference/... (right).
     url_path = str(Path(path).resolve().relative_to(REPO_ROOT)).removesuffix(".md")
     url = f"https://mariadb.com/docs/{url_path}"
     desc_str += f"\n\nURL: {url}"
@@ -402,7 +392,7 @@ def escape_sql(text):
     """
     text = text.replace("\\", "\\\\")  # \ → \\
     text = text.replace("'", "''")      # ' → '' (SQL standard escaping)
-    text = text.replace("\n", "\\n")   # newline → literal \n for single-line INSERT
+    text = text.replace("\n", "\\n")   # bootstrap parser is line-naïve; keep INSERT on one line
     return text
 
 def generate_insert(name, description, example, help_topic_id, url, help_category_id):

diff --git a/help-tables/validate_sql.py b/help-tables/validate_sql.py
@@ -15,77 +15,103 @@
 #   1 — one or more errors found
 
 
+def split_statements(text: str):
+    """Return a list of (start_line, statement) pairs split on top-level ';'.
+
+    start_line is the 1-based line of the statement's first character. '--'
+    comments are dropped so no statement begins with one; quoted strings may
+    hold ';', '--', '' and backslash escapes; trailing text without ';' is kept.
+    """
+    statements, buf = [], []
+    line_no = start_line = 1
+    in_str = False
+    i, n = 0, len(text)
+    while i < n:
+        c = text[i]
+        step = 1
+        if not buf:
+            if c.isspace():
+                # Whitespace between statements only advances the line counter.
+                line_no += c == "\n"
+                i += 1
+                continue
+            start_line = line_no
+        if in_str:
+            # '' and backslash pairs are copied whole: neither ends the string.
+            if i + 1 < n and (c == "\\" or text[i:i + 2] == "''"):
+                step = 2
+            in_str = step == 2 or c != "'"
+        elif text.startswith("--", i):
+            # Skip the comment but leave its newline for the line counter.
+            end = text.find("\n", i)
+            i = n if end < 0 else end
+            continue
+        elif c == "'":
+            in_str = True
+        chunk = text[i:i + step]
+        line_no += chunk.count("\n")
+        buf.append(chunk)
+        i += step
+        if chunk == ";" and not in_str:
+            statements.append((start_line, "".join(buf).strip()))
+            buf = []
+    # Keep an unterminated tail so callers can see (and reject) truncated input.
+    tail = "".join(buf).strip()
+    return statements + [(start_line, tail)] if tail else statements
+
+
 def validate_sql(file_path: str = "fill_help_tables.sql") -> bool:
     """Validate the generated fill_help_tables.sql file."""
     errors = []
     warnings = []
 
-    # --- Step 1: Read the file ---
-    # Fail immediately if the file doesn't exist or is empty.
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
-            lines = f.readlines()
+            text = f.read()
     except FileNotFoundError:
         print(f"FATAL: {file_path} not found")
         return False
 
-    if not lines:
+    if not text.strip():
         print(f"FATAL: {file_path} is empty")
         return False
 
-    # --- Step 2: Classify every line into buckets ---
-    # Each line must be one of: preamble (comments/setup), or an INSERT into
-    # one of the four help tables. Any unrecognised line is flagged as an error.
-    # This ensures the file only contains expected content and isn't partially
-    # corrupted or truncated.
-    topic_lines = []
-    category_lines = []
-    keyword_lines = []
-    relation_lines = []
-    preamble_lines = []
-
-    for i, raw in enumerate(lines, 1):
-        line = raw.rstrip('\n')
-
-        if not line or line.startswith('--'):
-            # Empty lines and SQL comments are expected preamble
-            preamble_lines.append((i, line))
-        elif line.lower().startswith('set ') or line.lower().startswith('use ') or line.lower().startswith('delete '):
-            # Setup statements: charset, database selection, table clears
-            preamble_lines.append((i, line))
-        elif line.startswith('INSERT INTO help_topic'):
-            topic_lines.append((i, line))
-        elif line.lower().startswith('insert into help_category'):
-            category_lines.append((i, line))
-        elif line.lower().startswith('insert into help_keyword'):
-            keyword_lines.append((i, line))
-        elif line.lower().startswith('insert into help_relation'):
-            relation_lines.append((i, line))
+    topic_stmts = []
+    category_stmts = []
+    keyword_stmts = []
+    relation_stmts = []
+
+    for line_num, stmt in split_statements(text):
+        head = stmt[:30].lower()
+        if head.startswith('--') or head.startswith('set ') or head.startswith('use ') or head.startswith('delete '):
+            continue
+        if head.startswith('insert into help_topic'):
+            topic_stmts.append((line_num, stmt))
+        elif head.startswith('insert into help_category'):
+            category_stmts.append((line_num, stmt))
+        elif head.startswith('insert into help_keyword'):
+            keyword_stmts.append((line_num, stmt))
+        elif head.startswith('insert into help_relation'):
+            relation_stmts.append((line_num, stmt))
         else:
-            errors.append(f"Line {i}: Unrecognized line: {line[:80]}")
+            errors.append(f"Line {line_num}: Unrecognized statement: {stmt[:80]}")
 
-    # --- Step 3: Validate help_topic entries ---
-    # Each topic INSERT must:
-    #   - End with ); (single-line format assumed — the generator writes one INSERT per line)
-    #   - Have a unique help_topic_id (primary key)
-    #   - Have a non-empty name (warn on duplicate names — MariaDB's column has a unique key)
     seen_topic_ids = set()
     seen_names = set()
 
-    for line_num, line in topic_lines:
-        if not line.endswith(");"):
-            errors.append(f"Line {line_num}: help_topic not terminated with );")
+    for line_num, stmt in topic_stmts:
+        size = len(stmt.encode('utf-8'))
+        if size > 20000:
+            errors.append(f"Line {line_num}: help_topic statement is {size} bytes (>20000 bootstrap limit)")
 
-        # Extract the first numeric value in VALUES(...) as the topic ID
-        id_match = re.search(r'VALUES \((\d+),', line)
+        id_match = re.match(r'INSERT INTO help_topic[^V]*VALUES\s*\((\d+),', stmt, re.IGNORECASE)
         if id_match:
             topic_id = int(id_match.group(1))
             if topic_id in seen_topic_ids:
                 errors.append(f"Line {line_num}: Duplicate help_topic_id {topic_id}")
             seen_topic_ids.add(topic_id)
 
-        # Extract the topic name (third column in the INSERT)
-        name_match = re.search(r"VALUES \(\d+, \d+, '([^']*(?:''[^']*)*)'", line)
+        name_match = re.match(r"INSERT INTO help_topic[^V]*VALUES\s*\(\d+,\s*\d+,\s*'([^']*(?:''[^']*)*)'", stmt, re.IGNORECASE)
         if name_match:
             name = name_match.group(1)
             if not name or name.isspace():
@@ -94,27 +120,17 @@ def validate_sql(file_path: str = "fill_help_tables.sql") -> bool:
                 warnings.append(f"Line {line_num}: Duplicate name '{name}'")
             seen_names.add(name)
 
-    # --- Step 4: Validate help_keyword entries ---
-    # Each keyword INSERT must end with ); and have a unique help_keyword_id.
     seen_keyword_ids = set()
-    for line_num, line in keyword_lines:
-        if not line.endswith(");"):
-            errors.append(f"Line {line_num}: help_keyword not terminated with );")
-        id_match = re.search(r'values\s*\((\d+),', line, re.IGNORECASE)
+    for line_num, stmt in keyword_stmts:
+        id_match = re.search(r'values\s*\((\d+),', stmt, re.IGNORECASE)
         if id_match:
             kid = int(id_match.group(1))
             if kid in seen_keyword_ids:
                 errors.append(f"Line {line_num}: Duplicate help_keyword_id {kid}")
             seen_keyword_ids.add(kid)
 
-    # --- Step 5: Validate help_relation entries (referential integrity) ---
-    # Each relation maps a topic_id to a keyword_id. Both IDs must have been
-    # seen in the earlier topic/keyword INSERTs, otherwise the relation points
-    # to a non-existent row (which would fail at load time in MariaDB).
-    for line_num, line in relation_lines:
-        if not line.endswith(");"):
-            errors.append(f"Line {line_num}: help_relation not terminated with );")
-        rel_match = re.search(r'values\s*\((\d+),(\d+)\)', line, re.IGNORECASE)
+    for line_num, stmt in relation_stmts:
+        rel_match = re.search(r'values\s*\((\d+),\s*(\d+)\)', stmt, re.IGNORECASE)
         if rel_match:
             tid = int(rel_match.group(1))
             kid = int(rel_match.group(2))
@@ -123,23 +139,18 @@ def validate_sql(file_path: str = "fill_help_tables.sql") -> bool:
             if kid not in seen_keyword_ids:
                 errors.append(f"Line {line_num}: help_relation references unknown keyword_id {kid}")
 
-    # --- Step 6: Sanity-check minimum counts ---
-    # If the generator silently failed (e.g. wrong source path, all files
-    # excluded), the output could be technically valid but near-empty.
-    # These thresholds catch that case and fail CI before bad SQL ships.
-    if len(topic_lines) < 500:
-        errors.append(f"Only {len(topic_lines)} topics (expected 500+)")
-    if len(category_lines) < 10:
-        errors.append(f"Only {len(category_lines)} categories (expected 10+)")
-    if len(keyword_lines) < 100:
-        errors.append(f"Only {len(keyword_lines)} keywords (expected 100+)")
-
-    # --- Step 7: Print report and return result ---
+    if len(topic_stmts) < 500:
+        errors.append(f"Only {len(topic_stmts)} topics (expected 500+)")
+    if len(category_stmts) < 10:
+        errors.append(f"Only {len(category_stmts)} categories (expected 10+)")
+    if len(keyword_stmts) < 100:
+        errors.append(f"Only {len(keyword_stmts)} keywords (expected 100+)")
+
     print("=== SQL Validation Report ===")
-    print(f"  Categories: {len(category_lines)}")
-    print(f"  Topics:     {len(topic_lines)}  (unique IDs: {len(seen_topic_ids)}, unique names: {len(seen_names)})")
-    print(f"  Keywords:   {len(keyword_lines)}  (unique IDs: {len(seen_keyword_ids)})")
-    print(f"  Relations:  {len(relation_lines)}")
+    print(f"  Categories: {len(category_stmts)}")
+    print(f"  Topics:     {len(topic_stmts)}  (unique IDs: {len(seen_topic_ids)}, unique names: {len(seen_names)})")
+    print(f"  Keywords:   {len(keyword_stmts)}  (unique IDs: {len(seen_keyword_ids)})")
+    print(f"  Relations:  {len(relation_stmts)}")
     print()
 
     if warnings: