koha-to-dspace/build.pl

96 lines
1.8 KiB
Perl
Executable File

#!/usr/bin/perl -w
#
# Splits a Dublin Core XML export into one directory per record under
# import/, numbering the directories from 1000 upwards.
#
# For every chunk of input terminated by "</dublin_core>\n" the script:
#   * writes the chunk (minus the <collection> wrapper tags) to
#     import/<id>/dublin_core.xml;
#   * if the record has a relation/uri value that looks downloadable,
#     fetches it with curl, identifies it with file(1), and
#       - renames PDFs to <name>.pdf,
#       - converts Word documents to PDF with soffice and keeps both the
#         .docx and the .pdf;
#   * writes import/<id>/contents listing the resulting file names.
#
# NOTE(review): the dublin_core.xml + contents layout looks like the DSpace
# Simple Archive Format - confirm against the importer that consumes it.
#
# Usage: build.pl EXPORT.xml
# The first argument is also re-read to detect URLs that occur more than
# once; records with such URLs are not downloaded.
#
# External commands required on PATH: curl, file, soffice.
use strict;
use warnings;
use utf8;
use File::Copy qw(move);
use File::Path qw(remove_tree);

binmode(STDOUT, ":utf8");

# Alphabet for the random local file names given to direct downloads.
my @chars = ("A".."Z", "a".."z", "1".."9", "_");

# Read the input one <dublin_core> record at a time.
$/ = "</dublin_core>\n";

my $next_id = 1000;

# Remember the input file name now: reading via <> shifts it off @ARGV.
my $file = $ARGV[0];

# Lazily loaded contents of $file, used only by count_occurrences().
my $source_text;

# Count the non-overlapping literal occurrences of $needle in the file $path.
# Returns 0 when there is no readable file to check (e.g. input on STDIN).
# The needle is matched as a plain string, never as a pattern.
sub count_occurrences {
    my ($needle, $path) = @_;
    return 0 unless defined($path) && $path ne '-' && length($needle);

    if (!defined $source_text) {
        $source_text = '';
        if (open my $in, '<', $path) {
            local $/;    # slurp the whole file regardless of the record separator
            my $text = <$in>;
            close $in;
            $source_text = $text if defined $text;
        } else {
            warn "Cannot open $path for duplicate check, $!\n";
        }
    }

    my $count = 0;
    my $pos   = 0;
    while (($pos = index($source_text, $needle, $pos)) >= 0) {
        $count++;
        $pos += length $needle;
    }
    return $count;
}

# Run file(1) on $path without going through a shell and return its raw
# standard output (normally "<path>: <description>\n"), or '' on failure.
sub describe_file {
    my ($path) = @_;
    open my $pipe, '-|', 'file', $path or return '';
    local $/;
    my $out = <$pipe>;
    close $pipe;
    return defined $out ? $out : '';
}

mkdir "import", 0755;

my $last_id;    # id of the most recently created record directory

RECORD:
while (my $record = <>) {
    $record =~ s/<collection>\n//;
    $record =~ s/<\/collection>\n//;

    my $url      = '';    # relation URI exactly as found in the record
    my $name     = '';    # local base name for the downloaded file
    my $download = '';    # URL actually handed to curl ('' = nothing to fetch)

    if ($record =~ /<dcvalue element="relation" qualifier="uri">\s*(.*?)\s*<\/dcvalue>\n/) {
        $url  = $1;
        $name = $url;
        if ($name =~ /docs/ || $name =~ /file/) {
            # Links of the form ".../d/<id>/...": keep only <id> and build
            # a direct-download URL from it.
            if ($name =~ s@.*(d/)([^?&/]+).*@$2@) {
                $download = "https://drive.google.com/uc?export=download&id=$name";
            } else {
                # Without an id the whole URL would be used as a file name.
                warn "Cannot extract a document id from $url, skipping download\n";
            }
        } elsif ($name =~ /pdf/ && $name =~ /usi/) {
            # Direct link: download as is, under a random 33-character name.
            $download = $name;
            $name     = '';
            $name .= $chars[rand @chars] for 1..33;
        }
    }

    my $id = $next_id++;
    $last_id = $id;
    print "/$id\n";

    # Not checked: the directory may be left over from an earlier run, and
    # the open below fails loudly if it is really unusable.
    mkdir "import/$id", 0755;

    open my $dc, '>', "import/$id/dublin_core.xml"
        or die "Cannot open dublin core for $id, $!\n";
    print {$dc} $record;
    close $dc
        or die "Cannot write dublin core for $id, $!\n";

    next RECORD unless $download;

    print "$download\n";
    print "$name\n";

    # A URL that occurs more than once in the input is never downloaded,
    # not even for its first record.
    if (count_occurrences($url, $file) > 1) {
        print "DUPLICATED\n";
        next RECORD;
    }

    my $target = "import/$id/$name";

    # List-form system: the URL and path never pass through a shell.
    system('curl', '-L', $download, '-o', $target) == 0
        or warn "curl failed for $download (status $?)\n";

    my $output = describe_file($target);
    print "$output\n";

    # Classify on the description only; the leading "<path>:" contains the
    # (possibly random) file name and could match PDF or Word by accident.
    my $description = $output;
    $description =~ s/\A\Q$target\E:\s*//;

    my $ext = '';
    if ($description =~ /PDF/) {
        $ext = "pdf";
        move($target, "$target.pdf")
            or warn "Cannot rename $target to $target.pdf, $!\n";
    } elsif ($description =~ /Word/) {
        # soffice writes <name>.pdf into the current directory.
        system('soffice', '--headless', '--convert-to', 'pdf', $target) == 0
            or warn "soffice failed for $target (status $?)\n";
        $ext = "docx";
        move($target, "$target.docx")
            or warn "Cannot rename $target to $target.docx, $!\n";
        move("$name.pdf", "$target.pdf")
            or warn "Cannot move $name.pdf to $target.pdf, $!\n";
    }

    if ($ext eq 'pdf' || $ext eq 'docx') {
        open my $out, '>', "import/$id/contents"
            or die "Cannot open contents for $id, $!\n";
        print {$out} "$name.pdf";
        print {$out} "\n$name.docx" if $ext eq 'docx';
        close $out
            or die "Cannot write contents for $id, $!\n";
    }
}

# The directory created for the last chunk is always discarded.
# NOTE(review): presumably the final chunk is just the trailing
# </collection> residue rather than a real record - confirm.
# Guarded so that empty input does not remove the whole import/ directory.
remove_tree("import/$last_id") if defined $last_id;
__END__