koha-to-dspace/build.pl

102 lines
2.2 KiB
Perl
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/perl -w
#
# Split a Dublin Core export into one item directory per record.
#
# Usage: build.pl EXPORT.xml
#
# The input is read record by record, where a record ends at
# "</dublin_core>\n". For every record this script:
#
#   * allocates the next numeric id (starting at 1000) and creates
#     import/<id>/dublin_core.xml holding the record text;
#   * looks for a <dcvalue element="relation" qualifier="uri"> value that
#     mentions "docs" or "file" and contains a "/d/<file id>" path segment,
#     and if found downloads
#     https://drive.google.com/uc?export=download&id=<file id> with curl;
#   * classifies the download with file(1): a PDF is renamed to
#     <file id>.pdf; a Word document is converted to PDF with soffice and
#     kept as <file id>.docx next to the generated <file id>.pdf;
#   * writes import/<id>/contents listing the files that were produced.
#
# A record whose relation URL occurs more than once in EXPORT.xml is
# reported as DUPLICATED and nothing is downloaded for it.
#
# External programs used: curl, file, soffice.
use strict;
use warnings;
use utf8;
use File::Copy qw(move);

binmode(STDOUT, ":utf8");

# Every read from <> returns one complete <dublin_core> record.
$/ = "</dublin_core>\n";

my $next_id = 1000;         # id given to the first record; one id per record
my $file    = $ARGV[0];     # saved now because <> shifts @ARGV while reading

-d 'import'
    or mkdir('import', 0755)
    or die "Cannot create import directory, $!\n";

RECORD:
while (defined(my $record = <>)) {
    # Discard the top and bottom wrapper tags.
    $record =~ s/<collection>\n//;
    $record =~ s/<\/collection>\n//;

    # The text after the last record is just "</collection>\n", which is
    # empty once the wrapper is stripped; it must not become an item.
    next RECORD if $record !~ /\S/;

    # Work out the download URL from the relation URI, if there is one.
    # The relation element itself is left in the record.
    my $url      = '';
    my $file_id  = '';
    my $download = '';
    if ($record =~ m{<dcvalue element="relation" qualifier="uri">\s*(.*?)\s*</dcvalue>\n}) {
        $url     = $1;
        $file_id = drive_file_id($url);
        if (length $file_id) {
            $download = "https://drive.google.com/uc?export=download&id=$file_id";
            print "$download\n";
            print "\n";
        }
    }

    my $id       = $next_id++;
    my $item_dir = "import/$id";

    # Let the operator know where we are up to.
    print "/$id\n";

    # Create the item directory (an existing one is reused).
    -d $item_dir
        or mkdir($item_dir, 0755)
        or die "Cannot create $item_dir, $!\n";

    # Create the dublin_core.xml file.
    open(my $dc, '>', "$item_dir/dublin_core.xml")
        or die "Cannot open dublin core for $id, $!\n";
    print {$dc} $record;
    close($dc)
        or die "Cannot write dublin core for $id, $!\n";

    # Everything below only applies when there is a file to fetch.
    next RECORD unless length $download;

    print "$download\n";
    print "$file_id\n";

    if (count_occurrences($url, $file) > 1) {
        print "DUPLICATED";
        next RECORD;
    }

    # List-form system: the URL and file id never pass through a shell.
    my $target = "$item_dir/$file_id";
    if (system('curl', '-L', $download, '-o', $target) != 0) {
        warn "Download failed for $download (item $id)\n";
        next RECORD;
    }

    my $output = run_capture('file', $target);
    print $output;

    # file(1) prefixes its answer with the file name; remove it so that
    # letters inside the file id cannot be mistaken for a file type.
    (my $description = $output) =~ s/\A\Q$target\E:\s*//;

    my @bitstreams;
    if ($description =~ /PDF/) {
        if (rename($target, "$target.pdf")) {
            push @bitstreams, "$file_id.pdf";
        }
        else {
            warn "Cannot rename $target to $target.pdf, $!\n";
        }
    }
    elsif ($description =~ /Word/) {
        # soffice writes <file id>.pdf into the current directory.
        system('soffice', '--headless', '--convert-to', 'pdf', $target) == 0
            or warn "soffice conversion failed for $target\n";

        if (move("$file_id.pdf", "$target.pdf")) {
            push @bitstreams, "$file_id.pdf";
        }
        else {
            warn "Cannot move $file_id.pdf to $target.pdf, $!\n";
        }

        if (rename($target, "$target.docx")) {
            push @bitstreams, "$file_id.docx";
        }
        else {
            warn "Cannot rename $target to $target.docx, $!\n";
        }
    }

    # Create the contents file, one bitstream name per line (no trailing
    # newline), listing only files that actually exist.
    if (@bitstreams) {
        open(my $contents, '>', "$item_dir/contents")
            or die "Cannot open contents for $id, $!\n";
        print {$contents} join("\n", @bitstreams);
        close($contents)
            or die "Cannot write contents for $id, $!\n";
    }
}

# Return the Google Drive file id embedded in $url, or '' when there is none.
# Only URLs mentioning "docs" or "file" are considered, and the id is the
# path segment that follows the first "/d/".
sub drive_file_id {
    my ($url) = @_;
    return '' unless $url =~ /docs/ || $url =~ /file/;
    return $url =~ m{/d/([^?&/]+)} ? $1 : '';
}

# Count the non-overlapping literal occurrences of $needle in the file $path.
# Returns 0 when no path is known, the needle is empty, or the file cannot
# be opened (a warning is printed in the last case).
sub count_occurrences {
    my ($needle, $path) = @_;
    return 0 unless defined $path && length $needle;

    open(my $in, '<', $path) or do {
        warn "Cannot open $path to check for duplicates, $!\n";
        return 0;
    };

    local $/ = "\n";    # scan line by line, not by <dublin_core> record
    my $count = 0;
    while (defined(my $line = <$in>)) {
        my $pos = 0;
        while (($pos = index($line, $needle, $pos)) >= 0) {
            $count++;
            $pos += length $needle;
        }
    }
    close($in);
    return $count;
}

# Run @command without a shell and return everything it wrote to STDOUT,
# or '' when the command could not be started.
sub run_capture {
    my @command = @_;

    open(my $pipe, '-|', @command) or do {
        warn "Cannot run $command[0], $!\n";
        return '';
    };

    local $/;           # slurp the whole output
    my $output = <$pipe>;
    close($pipe);
    return defined $output ? $output : '';
}
__END__