diff --git a/.github/workflows/generate-help-tables.yml b/.github/workflows/generate-help-tables.yml index cb911ffe3..95c115e83 100644 --- a/.github/workflows/generate-help-tables.yml +++ b/.github/workflows/generate-help-tables.yml @@ -44,10 +44,11 @@ jobs: - name: Check server compatibility run: | - docker run --rm -v $PWD/help-tables:/docker-entrypoint-initdb.d mariadb:11.4 \ - bash -c '(echo -e "create database mysql;\nuse mysql;"; \ - cat /usr/share/mariadb/mariadb_system_tables.sql /docker-entrypoint-initdb.d/fill_help_tables.sql) \ - | /usr/sbin/mariadbd --bootstrap' + docker run --rm -v "$PWD/help-tables:/help-tables" mariadb:11.4 sh -c ' + { printf "create database mysql;\nuse mysql;\n"; + cat /usr/share/mariadb/mariadb_system_tables.sql /help-tables/fill_help_tables.sql; + } | mariadbd --bootstrap + ' - name: Upload artifact uses: actions/upload-artifact@v4 diff --git a/help-tables/markdown_extractor.py b/help-tables/markdown_extractor.py index c2ca1d50c..711b81360 100644 --- a/help-tables/markdown_extractor.py +++ b/help-tables/markdown_extractor.py @@ -325,12 +325,8 @@ def strip_markdown(text: str) -> str: return text.strip() -def truncate_to_bytes(text: str, max_bytes: int = 60000) -> str: - """Truncate a string so its UTF-8 encoding fits within max_bytes. - - MariaDB's help_topic.description column is TEXT (max 65,535 bytes). - We use 60,000 as a conservative limit to leave headroom. 
- """ +def truncate_to_bytes(text: str, max_bytes: int = 15000) -> str: + """Truncate to fit MAX_BOOTSTRAP_QUERY_SIZE (20000) after escape + boilerplate.""" encoded = text.encode('utf8') if len(encoded) <= max_bytes: return text @@ -370,12 +366,6 @@ def build_output(name, syntax: str, desc: str, example: list, path: str): parts.append(f"Examples\n--------\n\n{example_str}") desc_str = "\n\n".join(parts) if parts else "" desc_str = strip_markdown(desc_str) - # `path` comes from get_files() which walks REPO_ROOT/server/reference as - # an absolute path, so each entry is an absolute filesystem path (in CI: - # /home/runner/work/mariadb-docs/mariadb-docs/...). Make it relative to - # REPO_ROOT before tacking it onto the docs URL — otherwise we ship URLs - # like https://mariadb.com/docs//home/runner/... (broken) instead of - # https://mariadb.com/docs/server/reference/... (right). url_path = str(Path(path).resolve().relative_to(REPO_ROOT)).removesuffix(".md") url = f"https://mariadb.com/docs/{url_path}" desc_str += f"\n\nURL: {url}" @@ -402,7 +392,7 @@ def escape_sql(text): """ text = text.replace("\\", "\\\\") # \ → \\ text = text.replace("'", "''") # ' → '' (SQL standard escaping) - text = text.replace("\n", "\\n") # newline → literal \n for single-line INSERT + text = text.replace("\n", "\\n") # bootstrap parser is line-naïve; keep INSERT on one line return text def generate_insert(name, description, example, help_topic_id, url, help_category_id): diff --git a/help-tables/validate_sql.py b/help-tables/validate_sql.py index e2957e01d..b33e36ad6 100644 --- a/help-tables/validate_sql.py +++ b/help-tables/validate_sql.py @@ -15,77 +15,103 @@ # 1 — one or more errors found +def split_statements(text: str): + """Return a list of (start_line, statement) pairs, respecting SQL string and -- comment state.""" + statements = [] + buf = [] + line_no = 1 + start_line = 1 + in_str = False + i = 0 + n = len(text) + while i < n: + c = text[i] + if not in_str and c == "-" and i + 1 < n and 
text[i + 1] == "-": + while i < n and text[i] != "\n": + buf.append(text[i]) + i += 1 + continue + if c == "'": + if in_str and i + 1 < n and text[i + 1] == "'": + buf.append("''") + i += 2 + continue + in_str = not in_str + buf.append(c) + i += 1 + continue + if c == "\n": + line_no += 1 + if c == ";" and not in_str: + buf.append(c) + stmt = "".join(buf).strip() + if stmt: + statements.append((start_line, stmt)) + buf = [] + i += 1 + while i < n and text[i] in " \t\r\n": + if text[i] == "\n": + line_no += 1 + i += 1 + start_line = line_no + continue + buf.append(c) + i += 1 + return statements + + def validate_sql(file_path: str = "fill_help_tables.sql") -> bool: """Validate the generated fill_help_tables.sql file.""" errors = [] warnings = [] - # --- Step 1: Read the file --- - # Fail immediately if the file doesn't exist or is empty. try: with open(file_path, 'r', encoding='utf-8') as f: - lines = f.readlines() + text = f.read() except FileNotFoundError: print(f"FATAL: {file_path} not found") return False - if not lines: + if not text.strip(): print(f"FATAL: {file_path} is empty") return False - # --- Step 2: Classify every line into buckets --- - # Each line must be one of: preamble (comments/setup), or an INSERT into - # one of the four help tables. Any unrecognised line is flagged as an error. - # This ensures the file only contains expected content and isn't partially - # corrupted or truncated. 
- topic_lines = [] - category_lines = [] - keyword_lines = [] - relation_lines = [] - preamble_lines = [] - - for i, raw in enumerate(lines, 1): - line = raw.rstrip('\n') - - if not line or line.startswith('--'): - # Empty lines and SQL comments are expected preamble - preamble_lines.append((i, line)) - elif line.lower().startswith('set ') or line.lower().startswith('use ') or line.lower().startswith('delete '): - # Setup statements: charset, database selection, table clears - preamble_lines.append((i, line)) - elif line.startswith('INSERT INTO help_topic'): - topic_lines.append((i, line)) - elif line.lower().startswith('insert into help_category'): - category_lines.append((i, line)) - elif line.lower().startswith('insert into help_keyword'): - keyword_lines.append((i, line)) - elif line.lower().startswith('insert into help_relation'): - relation_lines.append((i, line)) + topic_stmts = [] + category_stmts = [] + keyword_stmts = [] + relation_stmts = [] + + for line_num, stmt in split_statements(text): + head = stmt[:30].lower() + if head.startswith('--') or head.startswith('set ') or head.startswith('use ') or head.startswith('delete '): + continue + if head.startswith('insert into help_topic'): + topic_stmts.append((line_num, stmt)) + elif head.startswith('insert into help_category'): + category_stmts.append((line_num, stmt)) + elif head.startswith('insert into help_keyword'): + keyword_stmts.append((line_num, stmt)) + elif head.startswith('insert into help_relation'): + relation_stmts.append((line_num, stmt)) else: - errors.append(f"Line {i}: Unrecognized line: {line[:80]}") + errors.append(f"Line {line_num}: Unrecognized statement: {stmt[:80]}") - # --- Step 3: Validate help_topic entries --- - # Each topic INSERT must: - # - End with ); (single-line format assumed — the generator writes one INSERT per line) - # - Have a unique help_topic_id (primary key) - # - Have a non-empty name (warn on duplicate names — MariaDB's column has a unique key) seen_topic_ids = 
set() seen_names = set() - for line_num, line in topic_lines: - if not line.endswith(");"): - errors.append(f"Line {line_num}: help_topic not terminated with );") + for line_num, stmt in topic_stmts: + size = len(stmt.encode('utf-8')) + if size > 20000: + errors.append(f"Line {line_num}: help_topic statement is {size} bytes (>20000 bootstrap limit)") - # Extract the first numeric value in VALUES(...) as the topic ID - id_match = re.search(r'VALUES \((\d+),', line) + id_match = re.match(r'INSERT INTO help_topic[^V]*VALUES\s*\((\d+),', stmt, re.IGNORECASE) if id_match: topic_id = int(id_match.group(1)) if topic_id in seen_topic_ids: errors.append(f"Line {line_num}: Duplicate help_topic_id {topic_id}") seen_topic_ids.add(topic_id) - # Extract the topic name (third column in the INSERT) - name_match = re.search(r"VALUES \(\d+, \d+, '([^']*(?:''[^']*)*)'", line) + name_match = re.match(r"INSERT INTO help_topic[^V]*VALUES\s*\(\d+,\s*\d+,\s*'([^']*(?:''[^']*)*)'", stmt, re.IGNORECASE) if name_match: name = name_match.group(1) if not name or name.isspace(): @@ -94,27 +120,17 @@ def validate_sql(file_path: str = "fill_help_tables.sql") -> bool: warnings.append(f"Line {line_num}: Duplicate name '{name}'") seen_names.add(name) - # --- Step 4: Validate help_keyword entries --- - # Each keyword INSERT must end with ); and have a unique help_keyword_id. seen_keyword_ids = set() - for line_num, line in keyword_lines: - if not line.endswith(");"): - errors.append(f"Line {line_num}: help_keyword not terminated with );") - id_match = re.search(r'values\s*\((\d+),', line, re.IGNORECASE) + for line_num, stmt in keyword_stmts: + id_match = re.search(r'values\s*\((\d+),', stmt, re.IGNORECASE) if id_match: kid = int(id_match.group(1)) if kid in seen_keyword_ids: errors.append(f"Line {line_num}: Duplicate help_keyword_id {kid}") seen_keyword_ids.add(kid) - # --- Step 5: Validate help_relation entries (referential integrity) --- - # Each relation maps a topic_id to a keyword_id. 
Both IDs must have been - # seen in the earlier topic/keyword INSERTs, otherwise the relation points - # to a non-existent row (which would fail at load time in MariaDB). - for line_num, line in relation_lines: - if not line.endswith(");"): - errors.append(f"Line {line_num}: help_relation not terminated with );") - rel_match = re.search(r'values\s*\((\d+),(\d+)\)', line, re.IGNORECASE) + for line_num, stmt in relation_stmts: + rel_match = re.search(r'values\s*\((\d+),\s*(\d+)\)', stmt, re.IGNORECASE) if rel_match: tid = int(rel_match.group(1)) kid = int(rel_match.group(2)) @@ -123,23 +139,18 @@ def validate_sql(file_path: str = "fill_help_tables.sql") -> bool: if kid not in seen_keyword_ids: errors.append(f"Line {line_num}: help_relation references unknown keyword_id {kid}") - # --- Step 6: Sanity-check minimum counts --- - # If the generator silently failed (e.g. wrong source path, all files - # excluded), the output could be technically valid but near-empty. - # These thresholds catch that case and fail CI before bad SQL ships. 
- if len(topic_lines) < 500: - errors.append(f"Only {len(topic_lines)} topics (expected 500+)") - if len(category_lines) < 10: - errors.append(f"Only {len(category_lines)} categories (expected 10+)") - if len(keyword_lines) < 100: - errors.append(f"Only {len(keyword_lines)} keywords (expected 100+)") - - # --- Step 7: Print report and return result --- + if len(topic_stmts) < 500: + errors.append(f"Only {len(topic_stmts)} topics (expected 500+)") + if len(category_stmts) < 10: + errors.append(f"Only {len(category_stmts)} categories (expected 10+)") + if len(keyword_stmts) < 100: + errors.append(f"Only {len(keyword_stmts)} keywords (expected 100+)") + print("=== SQL Validation Report ===") - print(f" Categories: {len(category_lines)}") - print(f" Topics: {len(topic_lines)} (unique IDs: {len(seen_topic_ids)}, unique names: {len(seen_names)})") - print(f" Keywords: {len(keyword_lines)} (unique IDs: {len(seen_keyword_ids)})") - print(f" Relations: {len(relation_lines)}") + print(f" Categories: {len(category_stmts)}") + print(f" Topics: {len(topic_stmts)} (unique IDs: {len(seen_topic_ids)}, unique names: {len(seen_names)})") + print(f" Keywords: {len(keyword_stmts)} (unique IDs: {len(seen_keyword_ids)})") + print(f" Relations: {len(relation_stmts)}") print() if warnings: