80 lines
1.8 KiB
Perl
Executable File
80 lines
1.8 KiB
Perl
Executable File
#!/usr/bin/perl -w
#
# Split a stream of <dublin_core> records (read from the files named on the
# command line, or from STDIN) into one directory per record under import/.
#
# For every record the script writes import/<id>/dublin_core.xml, where <id>
# is a running number starting at 1000.  When the record carries a
# relation/uri element that points at a Google "docs" URL containing a
# "d/<file id>" segment, the file is downloaded with curl, identified with
# file(1), converted to PDF with soffice if it is a Word document, and its
# final name is written to import/<id>/contents.
#
# External tools used: curl, file, soffice.  The import/ directory itself
# must already exist.

use strict;
use warnings;
use utf8;

binmode(STDOUT, ":utf8");

$/ = "</dublin_core>\n";    # record separator: one <dublin_core> element per read

my $what = 1000;            # running id; each record takes the next number

RECORD:
while (my $record = <>) {

    # discard the top and bottom tags
    $record =~ s/<collection>\n//;
    $record =~ s/<\/collection>\n//;

    # Whatever follows the last </dublin_core> (normally just the closing
    # </collection> tag) is returned as one more "record".  Skip it instead
    # of creating an item directory with an empty dublin_core.xml.
    next RECORD unless $record =~ /\S/;

    my $id  = $what++;
    my $cul = '';    # Google Drive file id; also used as the local file name
    my $tac = '';    # direct-download URL; stays empty when nothing is fetched

    # Derive the download URL from the relation/uri element.  The element
    # itself is left in the record.
    if ($record =~ /<dcvalue element="relation" qualifier="uri">\s*(.*?)\s*<\/dcvalue>\n/) {
        my $uri = $1;
        if ($uri =~ /docs/) {
            # The greedy .* picks the last "d/" that is followed by an id.
            if ($uri =~ m@.*d/([^?&/]+)@) {
                $cul = $1;
                $tac = "https://drive.google.com/uc?export=download&id=$cul";
                print "$tac\n";
                print "\n";
            }
            else {
                # Without an id the whole URL would end up as a file name.
                warn "No file id found in '$uri' for $id; nothing downloaded\n";
            }
        }
    }

    # let the operator know where we're up to (leading "/" kept so the
    # progress output stays the same as before)
    print "/$id\n";

    # create the item directory
    my $dir = "import/$id";
    unless (-d $dir) {
        mkdir $dir, 0755
            or die "Cannot create $dir, $!\n";
    }

    # create the dublin_core.xml file
    open my $dc, '>', "$dir/dublin_core.xml"
        or die "Cannot open dublin core for $id, $!\n";
    print {$dc} $record;
    close $dc
        or die "Cannot write dublin core for $id, $!\n";

    # the rest only applies when there is a file to fetch
    next RECORD unless $tac;

    print "$tac\n";
    print "$cul\n";

    my $file = "$dir/$cul";

    # List-form system: the URL and file name come from the input data and
    # must never pass through a shell.
    if (system('curl', '-L', $tac, '-o', $file) != 0) {
        warn "Download failed for $id ($tac); no contents file written\n";
        next RECORD;
    }

    # Ask file(1) what was downloaded.  -b leaves the path out of the
    # answer, so a file id that happens to contain "PDF" or "Word" cannot
    # be mistaken for the detected type.
    my $output = '';
    if (open my $probe, '-|', 'file', '-b', $file) {
        $output = do { local $/; <$probe> };    # slurp; $/ is the record separator here
        $output = '' unless defined $output;
        close $probe;
    }
    else {
        warn "Cannot run file(1) on $file, $!\n";
    }
    print "$file: $output";

    # Name under which the bitstream is listed in the contents file.  It
    # stays the bare file id unless the type is recognised below.
    my $name = $cul;

    if ($output =~ /PDF/) {
        if (rename $file, "$file.pdf") {
            $name = "$cul.pdf";
        }
        else {
            warn "Cannot rename $file to $file.pdf, $!\n";
        }
    }
    elsif ($output =~ /Word/) {
        # --outdir keeps the converted file next to the download; without
        # it soffice writes into the current working directory.
        my $status = system('soffice', '--headless', '--convert-to', 'pdf',
                            '--outdir', $dir, $file);
        if ($status == 0 and -e "$file.pdf") {
            unlink $file
                or warn "Cannot remove $file after conversion, $!\n";
            $name = "$cul.pdf";
        }
        else {
            warn "Conversion to PDF failed for $file; keeping the original\n";
        }
    }
    else {
        warn "Unrecognised file type for $file; keeping it without extension\n";
    }

    # ... create the contents file ...
    open my $out, '>', "$dir/contents"
        or die "Cannot open contents for $id, $!\n";
    print {$out} $name;
    close $out
        or die "Cannot write contents for $id, $!\n";
}

__END__
|