#!/usr/bin/perl -w
#
# Build an import/ directory tree from a stream of Dublin Core records.
#
# Input is read from the files named on the command line (or STDIN), one
# record per input record separator.  For every record the script:
#   1. creates import/<id>/ (ids are handed out sequentially from 1000),
#   2. writes the record to import/<id>/dublin_core.xml,
#   3. if the record's identifier is a Google "docs" link, downloads the
#      document with curl, converts Word documents to PDF with soffice,
#      and names the resulting file in import/<id>/contents.
#
# External programs used: curl, file, soffice.

use strict;
use warnings;
use utf8;

use File::Path qw(make_path);

binmode(STDOUT, ":utf8");

# NOTE(review): a bare newline is Perl's default separator, so this
# assignment is a no-op as written.  Together with the patterns flagged
# below it looks as if markup was stripped from this file at some point
# (presumably the separator was a closing record tag) -- TODO confirm
# against the original script.
$/ = "\n";    # record separator

my $next_id = 1000;    # dummy id for when there’s no file

while (my $record = <>) {
    my $file_id = "";    # Drive file id, reused as the local file name
    my $url     = "";    # direct-download URL; stays empty if no file

    # discard the top and bottom tags
    # NOTE(review): the first pattern only removes a newline; its opening
    # tag appears to have been lost (presumably the counterpart of the
    # closing collection tag below) -- TODO restore.
    $record =~ s/\n//;
    $record =~ s/<\/collection>\n//;

    # extract the file path from the identifier
    # NOTE(review): the opening dcvalue tag seems to be missing from this
    # pattern as well, and after the newline removal above the trailing \n
    # can no longer match when records are single lines -- TODO restore
    # the original pattern.
    if ($record =~ /\s*(.*?)\s*<\/dcvalue>\n/) {
        my $identifier = $1;
        if ($identifier =~ /docs/) {

            # The id is whatever follows the last "d/" up to the next
            # '?', '&' or '/'.
            if ($identifier =~ m{.*d/([^?&/]+)}) {
                $file_id = $1;
                $url = "https://drive.google.com/uc?export=download&id=$file_id";
                print "$url\n";
                # NOTE(review): probably printed a (now stripped) tag.
                print "\n";
            }
            else {
                # Without an id there is nothing sensible to download, and
                # the raw URL must not be used as a file name.
                warn "No document id found in identifier '$identifier'\n";
            }
        }
    }

    my $path     = '';
    my $id       = $next_id++;
    my $item_dir = "import/$id";

    # let the operator know where we’re up to
    print "$path/$id\n";

    # create the item directory (and import/ itself on the first record)
    make_path($item_dir, { mode => 0755 });
    -d $item_dir
        or die "Cannot create item directory for $id, $!\n";

    # create the dublin_core.xml file
    open my $dc, '>', "$item_dir/dublin_core.xml"
        or die "Cannot open dublin core for $id, $!\n";
    print {$dc} $record;
    close $dc
        or die "Cannot write dublin core for $id, $!\n";

    # assuming we have a file ...
    if ($url) {
        print "$url\n";
        print "$file_id\n";

        my $bitstream = fetch_bitstream($item_dir, $url, $file_id);

        # ... create the contents file ...
        if (defined $bitstream) {
            open my $contents, '>', "$item_dir/contents"
                or die "Cannot open contents for $id, $!\n";
            print {$contents} $bitstream;
            close $contents
                or die "Cannot write contents for $id, $!\n";
        }
    }
}

# Download $url into $item_dir as $file_id and normalise it to a PDF where
# possible.  Returns the final file name (relative to $item_dir), or
# nothing if the download failed.
sub fetch_bitstream {
    my ($item_dir, $url, $file_id) = @_;

    my $target = "$item_dir/$file_id";
    my $pdf    = "$target.pdf";

    # List-form system: the URL and file name come from the input data and
    # must never pass through a shell.
    if (system('curl', '-L', $url, '-o', $target) != 0 or !-e $target) {
        warn "Download of $url failed, no file recorded\n";
        return;
    }

    my $description = describe_file($target);
    print "$target: $description\n";

    if ($description =~ /PDF/) {
        return "$file_id.pdf" if rename $target, $pdf;
        warn "Cannot rename $target to $pdf, $!\n";
        return $file_id;
    }

    if ($description =~ /Word/) {
        # Without --outdir soffice writes the PDF into the current working
        # directory, not next to the source document.
        system('soffice', '--headless', '--convert-to', 'pdf',
            '--outdir', $item_dir, $target);
        if (-e $pdf) {
            unlink $target
                or warn "Cannot remove $target after conversion, $!\n";
            return "$file_id.pdf";
        }
        warn "Conversion of $target to PDF failed, keeping the original\n";
        return $file_id;
    }

    # Neither PDF nor Word: keep the download under its bare name rather
    # than inventing an extension.
    warn "Unrecognised file type for $target, keeping it as downloaded\n";
    return $file_id;
}

# Return the one-line description file(1) gives for $path.  The -b switch
# leaves the file name out of the output, so a name that happens to
# contain "PDF" or "Word" cannot be mistaken for a file type.
sub describe_file {
    my ($path) = @_;

    open my $pipe, '-|', 'file', '-b', $path
        or die "Cannot run file on $path, $!\n";
    my $description = do { local $/; <$pipe> };
    close $pipe;

    $description = '' unless defined $description;
    $description =~ s/\s+\z//;
    return $description;
}

__END__