Commit 15662b3e8644905032c2e26808401a487d4e90c1

Authored by Joe Perches
Committed by Linus Torvalds
1 parent 67d0a07544

checkpatch: add a --strict check for utf-8 in commit logs

Some people consider the use of UTF-8 in commit logs inappropriate.

Some patch commit logs contain unintended UTF-8 characters, typically
introduced by things like copy/pasting compilation output.

Look for the start of any commit log by skipping initial lines that look
like email headers and "From: " lines.

Stop looking for UTF-8 at the first "Signed-off-by:" line.

Signed-off-by: Joe Perches <joe@perches.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 26 additions and 4 deletions Side-by-side Diff

scripts/checkpatch.pl
... ... @@ -240,9 +240,8 @@
240 240 our $Type;
241 241 our $Declare;
242 242  
243   -our $UTF8 = qr {
244   - [\x09\x0A\x0D\x20-\x7E] # ASCII
245   - | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
  243 +our $NON_ASCII_UTF8 = qr{
  244 + [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
246 245 | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
247 246 | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
248 247 | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
... ... @@ -251,6 +250,11 @@
251 250 | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
252 251 }x;
253 252  
  253 +our $UTF8 = qr{
  254 + [\x09\x0A\x0D\x20-\x7E] # ASCII
  255 + | $NON_ASCII_UTF8
  256 +}x;
  257 +
254 258 our $typeTypedefs = qr{(?x:
255 259 (?:__)?(?:u|s|be|le)(?:8|16|32|64)|
256 260 atomic_t
... ... @@ -1330,6 +1334,9 @@
1330 1334 my $signoff = 0;
1331 1335 my $is_patch = 0;
1332 1336  
  1337 + my $in_header_lines = 1;
  1338 + my $in_commit_log = 0; #Scanning lines before patch
  1339 +
1333 1340 our @report = ();
1334 1341 our $cnt_lines = 0;
1335 1342 our $cnt_error = 0;
... ... @@ -1497,7 +1504,6 @@
1497 1504 if ($line =~ /^diff --git.*?(\S+)$/) {
1498 1505 $realfile = $1;
1499 1506 $realfile =~ s@^([^/]*)/@@;
1500   -
1501 1507 } elsif ($line =~ /^\+\+\+\s+(\S+)/) {
1502 1508 $realfile = $1;
1503 1509 $realfile =~ s@^([^/]*)/@@;
... ... @@ -1536,6 +1542,7 @@
1536 1542 # Check the patch for a signoff:
1537 1543 if ($line =~ /^\s*signed-off-by:/i) {
1538 1544 $signoff++;
  1545 + $in_commit_log = 0;
1539 1546 }
1540 1547  
1541 1548 # Check signature styles
... ... @@ -1611,6 +1618,21 @@
1611 1618  
1612 1619 CHK("INVALID_UTF8",
1613 1620 "Invalid UTF-8, patch and commit message should be encoded in UTF-8\n" . $hereptr);
  1621 + }
  1622 +
  1623 +# Check if it's the start of a commit log
  1624 +# (not a header line and we haven't seen the patch filename)
  1625 + if ($in_header_lines && $realfile =~ /^$/ &&
  1626 + $rawline !~ /^(commit\b|from\b|\w+:).+$/i) {
  1627 + $in_header_lines = 0;
  1628 + $in_commit_log = 1;
  1629 + }
  1630 +
  1631 +# Still not yet in a patch, check for any UTF-8
  1632 + if ($in_commit_log && $realfile =~ /^$/ &&
  1633 + $rawline =~ /$NON_ASCII_UTF8/) {
  1634 + CHK("UTF8_BEFORE_PATCH",
  1635 + "8-bit UTF-8 used in possible commit log\n" . $herecurr);
1614 1636 }
1615 1637  
1616 1638 # ignore non-hunk lines and lines being removed