Commit 15662b3e8644905032c2e26808401a487d4e90c1
Committed by
Linus Torvalds
1 parent
67d0a07544
Exists in
master
and in
6 other branches
checkpatch: add a --strict check for utf-8 in commit logs
Some find using utf-8 in commit logs inappropriate.

Some patch commit logs contain unintended utf-8 characters when doing things like copy/pasting compilation output.

Look for the start of any commit log by skipping initial lines that look like email headers and "From: " lines.

Stop looking for utf-8 at the first signature line.

Signed-off-by: Joe Perches <joe@perches.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 1 changed file with 26 additions and 4 deletions Side-by-side Diff
scripts/checkpatch.pl
... | ... | @@ -240,9 +240,8 @@ |
240 | 240 | our $Type; |
241 | 241 | our $Declare; |
242 | 242 | |
243 | -our $UTF8 = qr { | |
244 | - [\x09\x0A\x0D\x20-\x7E] # ASCII | |
245 | - | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte | |
243 | +our $NON_ASCII_UTF8 = qr{ | |
244 | + [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte | |
246 | 245 | | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs |
247 | 246 | | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte |
248 | 247 | | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates |
... | ... | @@ -251,6 +250,11 @@ |
251 | 250 | | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 |
252 | 251 | }x; |
253 | 252 | |
253 | +our $UTF8 = qr{ | |
254 | + [\x09\x0A\x0D\x20-\x7E] # ASCII | |
255 | + | $NON_ASCII_UTF8 | |
256 | +}x; | |
257 | + | |
254 | 258 | our $typeTypedefs = qr{(?x: |
255 | 259 | (?:__)?(?:u|s|be|le)(?:8|16|32|64)| |
256 | 260 | atomic_t |
... | ... | @@ -1330,6 +1334,9 @@ |
1330 | 1334 | my $signoff = 0; |
1331 | 1335 | my $is_patch = 0; |
1332 | 1336 | |
1337 | + my $in_header_lines = 1; | |
1338 | + my $in_commit_log = 0; #Scanning lines before patch | |
1339 | + | |
1333 | 1340 | our @report = (); |
1334 | 1341 | our $cnt_lines = 0; |
1335 | 1342 | our $cnt_error = 0; |
... | ... | @@ -1497,7 +1504,6 @@ |
1497 | 1504 | if ($line =~ /^diff --git.*?(\S+)$/) { |
1498 | 1505 | $realfile = $1; |
1499 | 1506 | $realfile =~ s@^([^/]*)/@@; |
1500 | - | |
1501 | 1507 | } elsif ($line =~ /^\+\+\+\s+(\S+)/) { |
1502 | 1508 | $realfile = $1; |
1503 | 1509 | $realfile =~ s@^([^/]*)/@@; |
... | ... | @@ -1536,6 +1542,7 @@ |
1536 | 1542 | # Check the patch for a signoff: |
1537 | 1543 | if ($line =~ /^\s*signed-off-by:/i) { |
1538 | 1544 | $signoff++; |
1545 | + $in_commit_log = 0; | |
1539 | 1546 | } |
1540 | 1547 | |
1541 | 1548 | # Check signature styles |
... | ... | @@ -1611,6 +1618,21 @@ |
1611 | 1618 | |
1612 | 1619 | CHK("INVALID_UTF8", |
1613 | 1620 | "Invalid UTF-8, patch and commit message should be encoded in UTF-8\n" . $hereptr); |
1621 | + } | |
1622 | + | |
1623 | +# Check if it's the start of a commit log | |
1624 | +# (not a header line and we haven't seen the patch filename) | |
1625 | + if ($in_header_lines && $realfile =~ /^$/ && | |
1626 | + $rawline !~ /^(commit\b|from\b|\w+:).+$/i) { | |
1627 | + $in_header_lines = 0; | |
1628 | + $in_commit_log = 1; | |
1629 | + } | |
1630 | + | |
1631 | +# Still not yet in a patch, check for any UTF-8 | |
1632 | + if ($in_commit_log && $realfile =~ /^$/ && | |
1633 | + $rawline =~ /$NON_ASCII_UTF8/) { | |
1634 | + CHK("UTF8_BEFORE_PATCH", | |
1635 | + "8-bit UTF-8 used in possible commit log\n" . $herecurr); | |
1614 | 1636 | } |
1615 | 1637 | |
1616 | 1638 | # ignore non-hunk lines and lines being removed |