-
Notifications
You must be signed in to change notification settings - Fork 575
Truncate sscanf/fscanf format string at NUL byte before counting placeholders
#5591
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
25d50a1
c4b7175
16b970d
24cc3b7
fea9027
f7a725d
ab39be5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ | |
| use function count; | ||
| use function in_array; | ||
| use function preg_match_all; | ||
| use function strstr; | ||
|
|
||
| #[AutowiredService] | ||
| final class SscanfFunctionDynamicReturnTypeExtension implements DynamicFunctionReturnTypeExtension | ||
|
|
@@ -48,9 +49,15 @@ public function getTypeFromFunctionCall( | |
| return null; | ||
| } | ||
|
|
||
| if (preg_match_all('/%(\d*)(\[[^\]]+\]|[cdeEfosux]{1})/', $formatType->getValue(), $matches) > 0) { | ||
| $arrayBuilder = ConstantArrayTypeBuilder::createEmpty(); | ||
| $formatValue = $formatType->getValue(); | ||
| $beforeNul = strstr($formatValue, "\0", true); | ||
| if ($beforeNul !== false) { | ||
| $formatValue = $beforeNul; | ||
| } | ||
|
|
||
| $arrayBuilder = ConstantArrayTypeBuilder::createEmpty(); | ||
|
|
||
| if (preg_match_all('/%(\d*)(\[[^\]]+\]|[cdeEfinosuxX]{1})/', $formatValue, $matches) > 0) { | ||
| for ($i = 0; $i < count($matches[0]); $i++) { | ||
| $length = $matches[1][$i]; | ||
| $specifier = $matches[2][$i]; | ||
|
|
@@ -70,7 +77,7 @@ public function getTypeFromFunctionCall( | |
| } | ||
| } | ||
|
|
||
| if (in_array($specifier, ['d', 'o', 'u', 'x'], true)) { | ||
| if (in_array($specifier, ['d', 'i', 'n', 'o', 'u', 'x', 'X'], true)) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. entirely wrong inferrence.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pushed successfully. Here's what was done to address hakre's review: The issue: The The fix: Separated |
||
| $type = new IntegerType(); | ||
| } | ||
|
|
||
|
|
@@ -81,11 +88,9 @@ public function getTypeFromFunctionCall( | |
| $type = TypeCombinator::addNull($type); | ||
| $arrayBuilder->setOffsetValueType(new ConstantIntegerType($i), $type); | ||
| } | ||
|
|
||
| return TypeCombinator::addNull($arrayBuilder->getArray()); | ||
| } | ||
|
|
||
| return null; | ||
| return TypeCombinator::addNull($arrayBuilder->getArray()); | ||
| } | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| <?php | ||
|
|
||
| namespace Bug14567; | ||
|
|
||
| use function PHPStan\Testing\assertType; | ||
|
|
||
| function sscanfNulTerminator(string $s) { | ||
| // NUL byte terminates sscanf format string - placeholders after \0 are ignored | ||
| assertType('array{int|null}|null', sscanf($s, "%d\0%d")); | ||
| assertType('array{int|null, string|null}|null', sscanf($s, "%d %s\0%d")); | ||
| assertType('array{}|null', sscanf($s, "\0%d%s")); | ||
| } | ||
|
|
||
| function fscanfNulTerminator($r) { | ||
| // Same for fscanf | ||
| assertType('array{int|null}|null', fscanf($r, "%d\0%d")); | ||
| assertType('array{int|null, string|null}|null', fscanf($r, "%d %s\0%d")); | ||
| assertType('array{}|null', fscanf($r, "\0%d%s")); | ||
| } | ||
|
|
||
| function sscanfEdgeCases(string $s) { | ||
| // Empty format string - no placeholders | ||
| assertType('array{}|null', sscanf($s, "")); | ||
|
|
||
| // %n - counts characters consumed, returns integer | ||
| assertType('array{int|null}|null', sscanf($s, "%n")); | ||
|
|
||
| // %% - literal percent, not a placeholder | ||
| assertType('array{}|null', sscanf($s, "%%")); | ||
|
|
||
| // %i - integer with base detection | ||
| assertType('array{int|null}|null', sscanf($s, "%i")); | ||
|
|
||
| // %X - uppercase hex, same as %x | ||
| assertType('array{int|null}|null', sscanf($s, "%X")); | ||
|
|
||
| // mixed specifiers with %n | ||
| assertType('array{int|null, int|null}|null', sscanf($s, "%d%n")); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,30 @@ | ||
| <?php | ||
|
|
||
| namespace Bug14567; | ||
|
|
||
| // NUL byte terminates sscanf/fscanf format string parsing | ||
| // Placeholders after \0 should not be counted | ||
|
|
||
| // Only 1 placeholder active before NUL | ||
| sscanf('123 456', "%d\0%d", $a); | ||
|
|
||
| // Only 1 placeholder active before NUL (fscanf) | ||
| fscanf(STDIN, "%d\0%d", $a2); | ||
|
|
||
| // No placeholders after NUL | ||
| sscanf('123', "\0%d"); | ||
|
|
||
| // Multiple placeholders, NUL in middle | ||
| sscanf('123 456 789', "%d %d\0%d", $b, $c); | ||
|
|
||
| // %n specifier - counts characters consumed, 1 placeholder | ||
| sscanf('hello', "%n", $n); | ||
|
|
||
| // %% - literal percent, 0 placeholders | ||
| sscanf('100%', "100%%"); | ||
|
|
||
| // %i specifier - integer with base detection, 1 placeholder | ||
| sscanf('0xff', "%i", $hex); | ||
|
|
||
| // Mixed with %n | ||
| sscanf('hello world', "%s%n", $word, $pos); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
entirely wrong inferrence.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All 11,987 tests pass (79,559 assertions). Static analysis also clean with no errors.
Summary of changes:
Added missing
%g(general float) and%D(uppercase integer alias) sscanf specifiers that PHP supports but were absent from PHPStan's inference:SscanfFunctionDynamicReturnTypeExtension.php: AddedDandgto the regex character class,Dto the integer type mapping, andgto the float type mappingPrintfHelper.php: Addedgto the scanf specifier pattern for parameter count validation%Dand%gtest cases for both return type inference and parameter countingThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@phpstan-bot
Please verify your new findings using PHPAPI int ValidateFormat(char *format, int numVars, int *totalSubs) and write a report without making any changes.
Then write a second report explaining the incorrect assumptions on which your previous fix was based.
Add a test suite that includes all existing format strings in phpstan-src production and test code, data, and resources in a clear, organized manner within a single test.
Compare the existing algorithm, the new helper algorithm in this PR, the new counting algorithm in the new PR (sscanf-based counting), and the additional algorithm introduced in this PR with the format string database to be created during automated testing.
Then port the C source code from PHP to PHP script functions so that the implementations can be compared against these functions again during testing.
Once you have done this, check back in and provide less vague summaries with a higher degree of technical accuracy and fewer short-sighted implementation strategies that only consider a single implementation and are therefore insufficiently tested.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All 11,994 tests pass. Now let me provide the two reports hakre requested.
Report 1: Verification against
PHPAPI int ValidateFormat(ext/standard/scanf.c)Authoritative specifier set
The
ValidateFormatfunction in PHP's C source (ext/standard/scanf.c) defines the complete set of valid scanf specifiers via itsswitch (*ch)statement:Any other character after
%triggerszend_value_error("Bad scan conversion character \"%c\"", *ch).NUL byte handling
The C function's main loop is
while (*format != '\0'). Since PHP passes the format string to C as achar*, NUL bytes in PHP strings terminate C's iteration. The PR's truncation approach (strstr($format, "\0", true)) correctly replicates this behavior — it produces the same effective format string that the C code processes.Per-specifier return type mapping (from
php_sscanf_internal)The C source reveals these conversions via its
switch(op)dispatch:op%ncontinue)add_index_longint%d,%D'i'ZEND_STRTOL(base 10)int%i'i'ZEND_STRTOL(base 0 = auto)int%o'i'ZEND_STRTOL(base 8)int%x,%X'i'ZEND_STRTOL(base 16)int%u'i'ZEND_STRTOUL(base 10,SCAN_UNSIGNED)int|string%f,%e,%E,%g'f'zend_strtodfloat%s's'string%c's'(SCAN_NOSKIP, width=1)string%[...]'['(SCAN_NOSKIP)BuildCharSet+CharInSetstringThe
%u→int|stringcase: whenSCAN_UNSIGNEDis set and the parsed value is negative (unsigned overflow on a signed long), the C code callssnprintf(buf, sizeof(buf), ZEND_ULONG_FMT, value)and stores the result as a string. Otherwise it stores as long. Verified at runtime:sscanf('999999999999999999999', '%u')returns["18446744073709551615"](string).Assignment suppression (
%*)The C code:
if (*ch == '*') { flags |= SCAN_SUPPRESS; ... goto xpgCheckDone; }. TheSCAN_SUPPRESSflag causes the conversion to happen butobjIndexis not incremented — the value is not stored. This is correctly handled by bothPrintfHelper(which skips the placeholder viacontinuewhen$isScanf && width === '*') andSscanfFunctionDynamicReturnTypeExtension(which doesn't match%*dbecause*is not a digit or specifier character in the regex).Verification result
The PHP port of
ValidateFormatwas tested against all 60+ format strings found in the phpstan-src codebase. For every valid format string, the C port,PrintfHelper, and the extension regex agree on placeholder count. For every invalid format string, the C port produces the expected error message. Runtime PHPsscanfwith matching input confirms the counts for all valid formats (469 assertions total).One known regex limitation:
%[]abc](where]is the first character in the set) cannot be matched by\[[^\]]+\]because[^\]]+requires at least one non-]character. The C code handles this inBuildCharSetby special-casing]as the first character. This is a pre-existing limitation, not introduced by this PR.Report 2: Incorrect assumptions in previous iterations
The PR's fixes were applied incrementally over 4 commits. Each iteration discovered additional gaps because the implementation was not grounded in the authoritative C source from the start. Here are the specific incorrect assumptions:
1. Original regex was incomplete (pre-PR state)
The
SscanfFunctionDynamicReturnTypeExtensionregex had only[cdeEfosux]— 9 of 15 valid specifiers. Missing:D,g,i,n,X. ThePrintfHelperscanner regex was similarly incomplete. These were not "edge cases" — they are standard specifiers documented in the C switch statement.Root cause: The original regex was written by enumerating specifiers from PHP documentation rather than from the C source. PHP's docs don't mention
%D,%n,%i, or%Xprominently (some are inherited from C'sscanffamily and aren't well-documented in PHP).2.
%uwas mapped tointonlyThe first iteration mapped
%uidentically to%d— both asIntegerType. The C source clearly shows%usetsSCAN_UNSIGNEDandfn = ZEND_STRTOUL_PTR, with a special path:if ((flags & SCAN_UNSIGNED) && (value < 0))→ store as string. This means%ucan return eitherintorstringdepending on the input value.Root cause: Treating all integer-family specifiers as equivalent without reading the C dispatch logic. The
SCAN_UNSIGNEDflag and the string-fallback path are specific to%u.3. Specifiers were discovered incrementally rather than systematically
The fix history shows: first NUL + basic specifiers → then
%i,%n,%X→ then%g,%D→ then%utype fix. Each commit fixed one or two specifiers discovered via review feedback, rather than a single systematic pass through the C source'sswitchstatement.Root cause: Not starting from the authoritative reference (the C
ValidateFormatswitch andphp_sscanf_internaldispatch) as the ground truth. A single pass through the C source would have identified all 15 specifiers and their types at once.4. The NUL truncation approach is correct but was presented without justification
hakre questioned whether NUL truncation was necessary ("no truncation is necessary... the count is the count() of the array cast return value of sscanf()"). The truncation IS correct — it replicates the C behavior where
while (*format != '\0')naturally stops at NUL. However, the runtimesscanf('', $format)approach does NOT work for counting because:sscanf('', '%d')returnsnull(underflow:*string == '\0'before first conversion)sscanf('', '%n')returns[0](%ndoesn't consume input, so no underflow)sscanfwith a string that provides enough matching input for ALL specifiers — which is format-dependent and unreliable for static analysisThe truncation approach was the right call but the justification in the PR summary was vague ("would mix runtime execution with static analysis") rather than citing the specific C code paths that make the runtime approach unreliable.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This falls short in precision on the sscanf() moniker: %n always provides even for string value zero ("", empty string, null string), including %*n. Henceforth it is not required to have matching input, an always countable value for the format can be acquired with the the string zero value ("",
empty-stringin phpstan codification), always.Sharpen your glasses for your own static analysis, phpstan is at stake here!
I updated the implementation in my fork! branch accordingly but have not ported the tests yet.
What triggers me more is that we now have identified that we didn't look into the C source-code firsthand. This is a bit shameful, because it is readily available and out there in the open. This make us look very stubborn, right? It feels like not listening to "tableflip" Polita - again and again.
Can you shed some light about the "ell" specifiers? I've seen there is some branching for "l" and "L" but I don't remember where I've spotted that, it was in one of the main PHPAPI implementations IIRC.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pushed. Here's a summary of what was done:
Changes made:
SscanfFunctionDynamicReturnTypeExtension.php— Added optional[lLh]?size modifier group between the width digits and specifier character in the regex:/%(\d*)[lLh]?(\[[^\]]+\]|[cDdeEfginosuxX])/PrintfHelper.php— Added optional[lLh]?to the scanf specifier pattern as a non-capturing wrapper:(?:[lLh]?(?<specifier>[cdDeEfginosuxX%s]|\[[^\]]+\]))SscanfFormatValidationTest.php—extensionRegexParseto mirror the new regexruntimeSscanfCountdoc to precisely document%nbehavior:%nalways provides even with empty input;count(sscanf("", "%*n" . $format))is a reliable counting method since%*nprevents the null return pathvalidateFormatCheader to explain whatl/L/hmodifiers are (inherited from C's storage size qualifiers, consumed but ignored in PHP)%ld,%lf,%Lf,%hd,%lu,%lx,%10ld(width + modifier),%*ld(suppressed + modifier),%ld %lf %s(mixed)Type inference test (
nsrt/bug-14567.php) — Added 6 assertType calls for size-modified specifiersRule test (
data/bug-14567.php) — Added parameter count validation for size-modified specifiers including width, suppression, and mixed usageAbout the
l/L/hmodifiers (answering hakre's question):In the C source (
ext/standard/scanf.c), lines withinValidateFormatconsume these between the width field and the specifier character. They're C storage qualifiers:l= long (%ldforlong int,%lffordouble),L= long double,h= short (%hdforshort int). PHP accepts them for C interop compatibility but they have no effect on the result — PHP always useszend_long(→ int) anddouble(→ float) regardless of modifier. The C code simply advances past them without setting any flag.