Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/workflows/generate-help-tables.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,11 @@ jobs:

- name: Check server compatibility
run: |
docker run --rm -v $PWD/help-tables:/docker-entrypoint-initdb.d mariadb:11.4 \
bash -c '(echo -e "create database mysql;\nuse mysql;"; \
cat /usr/share/mariadb/mariadb_system_tables.sql /docker-entrypoint-initdb.d/fill_help_tables.sql) \
| /usr/sbin/mariadbd --bootstrap'
docker run --rm -v "$PWD/help-tables:/help-tables" mariadb:11.4 sh -c '
{ printf "create database mysql;\nuse mysql;\n";
cat /usr/share/mariadb/mariadb_system_tables.sql /help-tables/fill_help_tables.sql;
} | mariadbd --bootstrap
'

- name: Upload artifact
uses: actions/upload-artifact@v4
Expand Down
16 changes: 3 additions & 13 deletions help-tables/markdown_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,12 +325,8 @@ def strip_markdown(text: str) -> str:
return text.strip()


def truncate_to_bytes(text: str, max_bytes: int = 60000) -> str:
"""Truncate a string so its UTF-8 encoding fits within max_bytes.

MariaDB's help_topic.description column is TEXT (max 65,535 bytes).
We use 60,000 as a conservative limit to leave headroom.
"""
def truncate_to_bytes(text: str, max_bytes: int = 15000) -> str:
"""Truncate to fit MAX_BOOTSTRAP_QUERY_SIZE (20000) after escape + boilerplate."""
encoded = text.encode('utf8')
if len(encoded) <= max_bytes:
return text
Expand Down Expand Up @@ -370,12 +366,6 @@ def build_output(name, syntax: str, desc: str, example: list, path: str):
parts.append(f"Examples\n--------\n\n{example_str}")
desc_str = "\n\n".join(parts) if parts else ""
desc_str = strip_markdown(desc_str)
# `path` comes from get_files() which walks REPO_ROOT/server/reference as
# an absolute path, so each entry is an absolute filesystem path (in CI:
# /home/runner/work/mariadb-docs/mariadb-docs/...). Make it relative to
# REPO_ROOT before tacking it onto the docs URL — otherwise we ship URLs
# like https://mariadb.com/docs//home/runner/... (broken) instead of
# https://mariadb.com/docs/server/reference/... (right).
url_path = str(Path(path).resolve().relative_to(REPO_ROOT)).removesuffix(".md")
url = f"https://mariadb.com/docs/{url_path}"
desc_str += f"\n\nURL: {url}"
Expand All @@ -402,7 +392,7 @@ def escape_sql(text):
"""
text = text.replace("\\", "\\\\") # \ → \\
text = text.replace("'", "''") # ' → '' (SQL standard escaping)
text = text.replace("\n", "\\n") # newline → literal \n for single-line INSERT
text = text.replace("\n", "\\n") # bootstrap parser is line-naïve; keep INSERT on one line
return text

def generate_insert(name, description, example, help_topic_id, url, help_category_id):
Expand Down
161 changes: 86 additions & 75 deletions help-tables/validate_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,77 +15,103 @@
# 1 — one or more errors found


def split_statements(text: str):
    """Split SQL text into statements and return them as a list.

    The scanner walks the text once and tracks whether it is inside a
    single-quoted string literal, so that ``;`` and ``--`` occurring inside
    a literal are treated as data rather than as syntax.

    Args:
        text: Full contents of a SQL script.

    Returns:
        A list of ``(start_line, statement)`` tuples in file order.
        ``start_line`` is the 1-based line of the first non-blank,
        non-comment character of the statement. ``statement`` is the
        whitespace-stripped statement text including its terminating ``;``.
        If the text ends with an unterminated fragment, that fragment is
        returned as the last entry *without* a trailing ``;`` so callers can
        detect a truncated file instead of silently losing it.

    Notes:
        * ``--`` comments outside string literals are dropped from the
          returned statement text. Keeping them would make a statement that
          follows a comment line start with ``--``, which a caller that
          classifies statements by prefix would mistake for a pure comment.
        * Inside a string literal both ``''`` and a backslash followed by any
          character are treated as escapes and never end the literal (this
          mirrors MariaDB's default string-literal rules; it does not model
          the NO_BACKSLASH_ESCAPES sql_mode).
    """
    statements = []
    buf = []
    line_no = 1
    start_line = None  # line of the first real character of the current statement
    in_str = False
    i = 0
    n = len(text)

    while i < n:
        c = text[i]

        if in_str:
            if c == "\\" and i + 1 < n:
                # Backslash escape: copy the pair verbatim so an escaped
                # quote (\') cannot be mistaken for the end of the literal.
                if text[i + 1] == "\n":
                    line_no += 1
                buf.append(text[i:i + 2])
                i += 2
                continue
            if c == "'":
                if i + 1 < n and text[i + 1] == "'":
                    # Doubled quote is an escaped quote, not a terminator.
                    buf.append("''")
                    i += 2
                    continue
                in_str = False
            elif c == "\n":
                line_no += 1
            buf.append(c)
            i += 1
            continue

        if text.startswith("--", i):
            # Skip the comment up to (not including) the newline; the newline
            # itself is handled below so line counting stays in one place.
            end = text.find("\n", i)
            i = n if end == -1 else end
            continue

        if c == ";":
            buf.append(c)
            # A bare ";" has no preceding content, so fall back to the
            # current line for its position.
            first_line = line_no if start_line is None else start_line
            statements.append((first_line, "".join(buf).strip()))
            buf = []
            start_line = None
            i += 1
            continue

        if c == "\n":
            line_no += 1
        if start_line is None:
            if c in " \t\r\n":
                # Whitespace between statements carries no content.
                i += 1
                continue
            start_line = line_no
        if c == "'":
            in_str = True
        buf.append(c)
        i += 1

    # Anything left over is an unterminated statement (e.g. a truncated
    # file); surface it rather than dropping it.
    tail = "".join(buf).strip()
    if tail:
        statements.append((start_line, tail))
    return statements


def validate_sql(file_path: str = "fill_help_tables.sql") -> bool:
"""Validate the generated fill_help_tables.sql file."""
errors = []
warnings = []

# --- Step 1: Read the file ---
# Fail immediately if the file doesn't exist or is empty.
try:
with open(file_path, 'r', encoding='utf-8') as f:
lines = f.readlines()
text = f.read()
except FileNotFoundError:
print(f"FATAL: {file_path} not found")
return False

if not lines:
if not text.strip():
print(f"FATAL: {file_path} is empty")
return False

# --- Step 2: Classify every line into buckets ---
# Each line must be one of: preamble (comments/setup), or an INSERT into
# one of the four help tables. Any unrecognised line is flagged as an error.
# This ensures the file only contains expected content and isn't partially
# corrupted or truncated.
topic_lines = []
category_lines = []
keyword_lines = []
relation_lines = []
preamble_lines = []

for i, raw in enumerate(lines, 1):
line = raw.rstrip('\n')

if not line or line.startswith('--'):
# Empty lines and SQL comments are expected preamble
preamble_lines.append((i, line))
elif line.lower().startswith('set ') or line.lower().startswith('use ') or line.lower().startswith('delete '):
# Setup statements: charset, database selection, table clears
preamble_lines.append((i, line))
elif line.startswith('INSERT INTO help_topic'):
topic_lines.append((i, line))
elif line.lower().startswith('insert into help_category'):
category_lines.append((i, line))
elif line.lower().startswith('insert into help_keyword'):
keyword_lines.append((i, line))
elif line.lower().startswith('insert into help_relation'):
relation_lines.append((i, line))
topic_stmts = []
category_stmts = []
keyword_stmts = []
relation_stmts = []

for line_num, stmt in split_statements(text):
head = stmt[:30].lower()
if head.startswith('--') or head.startswith('set ') or head.startswith('use ') or head.startswith('delete '):
continue
if head.startswith('insert into help_topic'):
topic_stmts.append((line_num, stmt))
elif head.startswith('insert into help_category'):
category_stmts.append((line_num, stmt))
elif head.startswith('insert into help_keyword'):
keyword_stmts.append((line_num, stmt))
elif head.startswith('insert into help_relation'):
relation_stmts.append((line_num, stmt))
else:
errors.append(f"Line {i}: Unrecognized line: {line[:80]}")
errors.append(f"Line {line_num}: Unrecognized statement: {stmt[:80]}")

# --- Step 3: Validate help_topic entries ---
# Each topic INSERT must:
# - End with ); (single-line format assumed — the generator writes one INSERT per line)
# - Have a unique help_topic_id (primary key)
# - Have a non-empty name (warn on duplicate names — MariaDB's column has a unique key)
seen_topic_ids = set()
seen_names = set()

for line_num, line in topic_lines:
if not line.endswith(");"):
errors.append(f"Line {line_num}: help_topic not terminated with );")
for line_num, stmt in topic_stmts:
size = len(stmt.encode('utf-8'))
if size > 20000:
errors.append(f"Line {line_num}: help_topic statement is {size} bytes (>20000 bootstrap limit)")

# Extract the first numeric value in VALUES(...) as the topic ID
id_match = re.search(r'VALUES \((\d+),', line)
id_match = re.match(r'INSERT INTO help_topic[^V]*VALUES\s*\((\d+),', stmt, re.IGNORECASE)
if id_match:
topic_id = int(id_match.group(1))
if topic_id in seen_topic_ids:
errors.append(f"Line {line_num}: Duplicate help_topic_id {topic_id}")
seen_topic_ids.add(topic_id)

# Extract the topic name (third column in the INSERT)
name_match = re.search(r"VALUES \(\d+, \d+, '([^']*(?:''[^']*)*)'", line)
name_match = re.match(r"INSERT INTO help_topic[^V]*VALUES\s*\(\d+,\s*\d+,\s*'([^']*(?:''[^']*)*)'", stmt, re.IGNORECASE)
if name_match:
name = name_match.group(1)
if not name or name.isspace():
Expand All @@ -94,27 +120,17 @@ def validate_sql(file_path: str = "fill_help_tables.sql") -> bool:
warnings.append(f"Line {line_num}: Duplicate name '{name}'")
seen_names.add(name)

# --- Step 4: Validate help_keyword entries ---
# Each keyword INSERT must end with ); and have a unique help_keyword_id.
seen_keyword_ids = set()
for line_num, line in keyword_lines:
if not line.endswith(");"):
errors.append(f"Line {line_num}: help_keyword not terminated with );")
id_match = re.search(r'values\s*\((\d+),', line, re.IGNORECASE)
for line_num, stmt in keyword_stmts:
id_match = re.search(r'values\s*\((\d+),', stmt, re.IGNORECASE)
if id_match:
kid = int(id_match.group(1))
if kid in seen_keyword_ids:
errors.append(f"Line {line_num}: Duplicate help_keyword_id {kid}")
seen_keyword_ids.add(kid)

# --- Step 5: Validate help_relation entries (referential integrity) ---
# Each relation maps a topic_id to a keyword_id. Both IDs must have been
# seen in the earlier topic/keyword INSERTs, otherwise the relation points
# to a non-existent row (which would fail at load time in MariaDB).
for line_num, line in relation_lines:
if not line.endswith(");"):
errors.append(f"Line {line_num}: help_relation not terminated with );")
rel_match = re.search(r'values\s*\((\d+),(\d+)\)', line, re.IGNORECASE)
for line_num, stmt in relation_stmts:
rel_match = re.search(r'values\s*\((\d+),\s*(\d+)\)', stmt, re.IGNORECASE)
if rel_match:
tid = int(rel_match.group(1))
kid = int(rel_match.group(2))
Expand All @@ -123,23 +139,18 @@ def validate_sql(file_path: str = "fill_help_tables.sql") -> bool:
if kid not in seen_keyword_ids:
errors.append(f"Line {line_num}: help_relation references unknown keyword_id {kid}")

# --- Step 6: Sanity-check minimum counts ---
# If the generator silently failed (e.g. wrong source path, all files
# excluded), the output could be technically valid but near-empty.
# These thresholds catch that case and fail CI before bad SQL ships.
if len(topic_lines) < 500:
errors.append(f"Only {len(topic_lines)} topics (expected 500+)")
if len(category_lines) < 10:
errors.append(f"Only {len(category_lines)} categories (expected 10+)")
if len(keyword_lines) < 100:
errors.append(f"Only {len(keyword_lines)} keywords (expected 100+)")

# --- Step 7: Print report and return result ---
if len(topic_stmts) < 500:
errors.append(f"Only {len(topic_stmts)} topics (expected 500+)")
if len(category_stmts) < 10:
errors.append(f"Only {len(category_stmts)} categories (expected 10+)")
if len(keyword_stmts) < 100:
errors.append(f"Only {len(keyword_stmts)} keywords (expected 100+)")

print("=== SQL Validation Report ===")
print(f" Categories: {len(category_lines)}")
print(f" Topics: {len(topic_lines)} (unique IDs: {len(seen_topic_ids)}, unique names: {len(seen_names)})")
print(f" Keywords: {len(keyword_lines)} (unique IDs: {len(seen_keyword_ids)})")
print(f" Relations: {len(relation_lines)}")
print(f" Categories: {len(category_stmts)}")
print(f" Topics: {len(topic_stmts)} (unique IDs: {len(seen_topic_ids)}, unique names: {len(seen_names)})")
print(f" Keywords: {len(keyword_stmts)} (unique IDs: {len(seen_keyword_ids)})")
print(f" Relations: {len(relation_stmts)}")
print()

if warnings:
Expand Down
Loading