#!/usr/bin/env perl
use strict;
use warnings;

use Encode;
use Encode::Guess qw/shiftjis/;

# Check every staged (added/copied/modified) text file and report those that
#   - are guessed to be something other than ASCII / UTF-8,
#   - contain a BOM character (U+FEFF), or
#   - contain CRLF line endings.
# One line is printed per offending file (first problem found only).
# The exit status is the number of offending files, capped at 255 so that a
# large count can never wrap around to 0 ("success").
#
# NOTE: the content that is inspected is read from the path on disk, not from
# the index, so it may differ from what is actually staged.
#
# core.quotepath=false keeps git from octal-escaping non-ASCII bytes in path
# names; escaped names cannot be opened and would be silently skipped.
open my $files, '-|',
  'git -c core.quotepath=false diff --staged --numstat --diff-filter=ACM'
  or die "cannot run git diff: $!";
my $retval = 0;
while (my $line = <$files>) {
  chomp $line;

  # --numstat prints "<added>\t<deleted>\t<path>"; split on TAB only so that
  # whitespace inside (or at the start of) the path is preserved.
  my ($added, $deleted, $path) = split /\t/, $line, 3;
  next unless defined $path and length $path; # malformed / empty line

  if ($added eq '-') {
    # binary file
    next;
  }

  # Three-arg open: the path is never interpreted as a mode or a command.
  open my $fh, '<', $path or next; # ignore open error
  binmode $fh;
  my $data = do { local $/; <$fh> };
  close $fh;

  # check ascii or utf8
  my $decoder = Encode::Guess->guess($data);
  next unless ref $decoder; # ignore guess error (guess returns a message string)
  my $name = $decoder->name;
  if ($name ne 'utf8' and $name ne 'ascii') {
    print "$path: is NOT utf8 but $name\n";
    $retval++;
    next;
  }

  # check BOM
  my $decoded = $decoder->decode($data);
  if ($decoded =~ /\x{feff}/) {
    print "$path: contains BOM\n";
    $retval++;
    next;
  }

  # check CRLF
  if ($decoded =~ /\r\n/) {
    print "$path: contains CRLF\n";
    $retval++;
    next;
  }
}

# Closing a pipe returns false when the child failed; report it instead of
# silently treating "git could not run" like "nothing to check".
close $files
  or warn $!
    ? "error closing git diff pipe: $!\n"
    : "git diff exited with status " . ($? >> 8) . "\n";

if ($retval) {
  print 'Try tools/toutf8unix.pl $file... to convert to suitable format', "\n";
}

# Exit statuses are commonly truncated to 8 bits, so clamp the count.
exit($retval > 255 ? 255 : $retval);
