# check_ocr

#!/usr/bin/perl

# Upper bound on how many leading pages of each PDF are checked for text.
# Checking of a file stops early at the first page on which text is found.
my $check_pages = 5;

use strict;
use CAM::PDF;
use File::Find::Rule;

# With no arguments at all, print the usage summary and abort.
# The quotes around the "." example are escaped: left bare they end the string
# literal, and Perl then treats the dot as the concatenation operator, which
# silently drops the example from the printed help.
# The message ends in a newline so die does not append a file/line suffix.
if (not defined $ARGV[0]) {
   die 
"Format: check_ocr [-q] [-r] [-y] [-n] [directory] or [filename.pdf] [filename.pdf] ...

If directory rather than filename is specified (e.g. \".\" for current directory), check_ocr will spider through all files in the directory.
If -r is also specified, check_ocr will also spider recursively through all files under the specified directory.
If -q is specified, result will be provided silently in exit code; only first file will be processed.
If -y is specified, only files with OCR will be displayed.
If -n is specified, only files without OCR will be displayed.
";
}

# Command-line switches.  Each variable holds the number of times the switch
# occurs in @ARGV (matched case-insensitively); non-zero means "enabled".
my $silent_mode    = scalar grep { /^-q$/i } @ARGV;
my $recursive_mode = scalar grep { /^-r$/i } @ARGV;
my $yes_mode       = scalar grep { /^-y$/i } @ARGV;
my $no_mode        = scalar grep { /^-n$/i } @ARGV;

# -y and -n given together cancel each other out.
if ($yes_mode && $no_mode) {
  $yes_mode = 0;
  $no_mode  = 0;
}

# PDF paths gathered from the remaining arguments, in argument order.
my @files;

ARGUMENT:
foreach my $x (@ARGV) {
  # Anything shaped like a one-character switch is not treated as a path.
  next ARGUMENT if $x =~ m/^-.$/;

  if (-d $x) {
    # Directory: pick up the *.pdf files in it; descend into
    # subdirectories only when -r was given.
    my $finder = File::Find::Rule->new;
    $finder->maxdepth(1) unless $recursive_mode;
    $finder->file()->name( qr/\.pdf$/i );
    push @files, $finder->in( $x );
    next ARGUMENT;
  }

  if (-f $x) {
    # Plain file: taken as-is, whatever its extension.
    push @files, $x;
    next ARGUMENT;
  }

  print STDERR "Could not find input file $x.\n";
}

# Examine every collected PDF and report whether extractable text was found
# on any of its first $check_pages pages.
#
# Output modes:
#   -q       exit straight after the first readable PDF; exit status is
#            1 when no text was found and 0 when text was found.
#   -y / -n  print only the names of files with / without text.
#   default  print "Yes : " or "No  : " followed by the file name.
#
# NOTE: in -q mode, if no readable PDF is encountered, the loop finishes
# without calling exit here.
foreach my $input_file (@files) {
  my $pdf = CAM::PDF->new($input_file);

  if (not $pdf) {
     print STDERR "$input_file does not appear to be a valid PDF or is not readable.\n";
     next;
  }

  # Never look past the configured page limit or the end of the document.
  my $last_page = $pdf->numPages();
  $last_page = $check_pages if $last_page > $check_pages;

  # A page counts as containing text only when at least three characters
  # are returned; stop at the first page that qualifies.
  my $ocr_not_detected = 1;
  foreach my $page (1 .. $last_page) {
    my $text = $pdf->getPageText($page);
    if (defined $text && length($text) >= 3) {
      $ocr_not_detected = 0;
      last;
    }
  }

  # Pass an explicit 0/1 rather than a possibly-empty string.
  exit($ocr_not_detected ? 1 : 0) if $silent_mode;

  if ($yes_mode || $no_mode) {
    # Filtered listing.  The switches hold occurrence counts, so they are
    # tested for truth only; combining them with bitwise & would drop
    # files when a switch is repeated (2 & 1 == 0).
    my $wanted = $ocr_not_detected ? $no_mode : $yes_mode;
    print $input_file . "\n" if $wanted;
  } else {
    print(($ocr_not_detected ? "No  : " : "Yes : "), $input_file . "\n");
  }
}
# syntax highlighted by Code2HTML, v. 0.9.1