#!/usr/bin/perl
# Configuration: maximum number of pages, counted from page 1, that are
# scanned for extractable text before a PDF is reported as having no OCR.
my $check_pages = 5;
use strict;
use CAM::PDF;
use File::Find::Rule;
# Print the usage text and abort when the script is invoked with no arguments.
# A non-interpolating heredoc keeps the help text literal, so the quoted "."
# example is printed as written rather than being parsed as a pair of strings
# joined by the concatenation operator (which silently dropped it).
# The text ends in a newline, so die() does not append file/line information.
if (not defined $ARGV[0]) {
    die <<'USAGE';
Format: check_ocr [-q] [-r] [-y] [-n] [directory] or [filename.pdf] [filename.pdf] ...
If directory rather than filename is specified (e.g. "." for current directory), check_ocr will spider through all files in the directory.
If -r is also specified, check_ocr will also spider recursively through all files under the specified directory.
If -q is specified, result will be provided silently in exit code; only first file will be processed.
If -y is specified, only files with OCR will be displayed.
If -n is specified, only files without OCR will be displayed.
USAGE
}
# Option flags may appear anywhere on the command line and are matched
# case-insensitively. Each variable holds the number of times its flag was
# given (0 when absent), so it can be used directly as a truth value.
my $silent_mode    = scalar(grep { /^-q$/i } @ARGV);
my $recursive_mode = scalar(grep { /^-r$/i } @ARGV);
my $yes_mode       = scalar(grep { /^-y$/i } @ARGV);
my $no_mode        = scalar(grep { /^-n$/i } @ARGV);

# -y and -n are mutually exclusive filters; giving both switches both off.
if ($yes_mode && $no_mode) {
    $no_mode  = 0;
    $yes_mode = 0;
}
# Build the list of PDF files to examine from the non-option arguments.
# Directories are searched for names ending in ".pdf" (case-insensitive),
# descending into subdirectories only when -r was given; plain files are
# taken as-is; anything else is reported on STDERR and skipped.
my @files;
for my $arg (@ARGV) {
    # Any two-character argument starting with "-" is treated as an option.
    next if ($arg =~ m/^-.$/);

    if (-d $arg) {
        my $rule = File::Find::Rule->new;
        # Without -r, stay in the named directory itself.
        $rule->maxdepth(1) unless $recursive_mode;
        push @files, $rule->file()->name( qr/\.pdf$/i )->in( $arg );
        next;
    }

    if (-f $arg) {
        push @files, $arg;
        next;
    }

    print STDERR "Could not find input file $arg.\n";
}
# Classify each collected PDF by whether its leading pages contain extractable
# text, then report the result according to the -q / -y / -n flags.
foreach my $input_file (@files) {
    my $pdf = CAM::PDF->new($input_file);
    if (not $pdf) {
        # Unreadable files are skipped; in silent mode the exit status therefore
        # reflects the first file that could actually be opened.
        print STDERR "$input_file does not appear to be a valid PDF or is not readable.\n";
        next;
    }

    # Inspect at most $check_pages pages, and never more than the document has.
    my $max_pages = $pdf->numPages();
    my $last_page = $max_pages < $check_pages ? $max_pages : $check_pages;

    # A document counts as having text as soon as one inspected page yields
    # at least 3 characters; scanning stops at that page.
    my $ocr_not_detected = 1;
    for my $page (1 .. $last_page) {
        my $text = $pdf->getPageText($page);
        next if !defined($text) || length($text) < 3;
        $ocr_not_detected = 0;
        last;
    }

    # Silent mode: report through the exit status only
    # (0 = text found, 1 = no text found) and stop after this file.
    exit($ocr_not_detected ? 1 : 0) if $silent_mode;

    if ($yes_mode || $no_mode) {
        # Filter mode: print the bare file name only when it matches the filter.
        # Logical tests are used here because the flag variables are occurrence
        # counts; bitwise "&" on a count of 2 or more would wrongly yield 0.
        my $wanted = $ocr_not_detected ? $no_mode : $yes_mode;
        print $input_file . "\n" if $wanted;
    }
    else {
        # Default mode: every file is listed with a Yes/No prefix.
        print(($ocr_not_detected ? "No : " : "Yes : ") . $input_file . "\n");
    }
}
# syntax highlighted by Code2HTML, v. 0.9.1