#!/usr/bin/perl -w
#
# Split an export of Dublin Core records into one directory per record.
#
# For every record read from the files named on the command line (or from
# STDIN) the script writes import/<id>/dublin_core.xml.  When a record
# carries a link from which a Google Drive file id can be extracted, the
# file is downloaded with curl, classified with file(1), renamed to *.pdf
# or (for Word documents) converted to PDF with soffice, and the resulting
# file names are listed in import/<id>/contents.
#
# External programs are always started in list form, so values taken from
# the input never pass through a shell.

use strict;
use warnings;
use utf8;
use File::Copy qw(move);

binmode(STDOUT, ":utf8");

# Cache of slurped files used by count_occurrences(), keyed by path.
my %content_of;

# Create $dir with mode 0755 unless it already exists; die on failure.
sub ensure_dir {
    my ($dir) = @_;
    return if -d $dir;
    mkdir($dir, 0755) or die "Cannot create directory $dir, $!\n";
    return;
}

# Return the number of non-overlapping literal occurrences of $needle in
# the file $path.  The file is read once and cached.  Returns 0 when no
# path is known or the file cannot be read, so the caller then treats the
# record as not duplicated.
sub count_occurrences {
    my ($needle, $path) = @_;
    return 0 if !defined $path || !length $needle;

    if (!exists $content_of{$path}) {
        my $content = '';
        if (open my $in, '<', $path) {
            local $/;
            my $slurped = <$in>;
            $content = $slurped if defined $slurped;
            close $in;
        }
        else {
            warn "Cannot read $path for duplicate check, $!\n";
        }
        $content_of{$path} = $content;
    }

    my $count = 0;
    my $pos   = 0;
    while (($pos = index($content_of{$path}, $needle, $pos)) >= 0) {
        $count++;
        $pos += length $needle;
    }
    return $count;
}

# Return the output of file(1) for $path ('' when it cannot be run).
sub describe_file {
    my ($path) = @_;
    my $pipe;
    if (!open($pipe, '-|', 'file', $path)) {
        warn "Cannot run file on $path, $!\n";
        return '';
    }
    my $text = do { local $/; <$pipe> };
    close $pipe;
    return defined $text ? $text : '';
}

# NOTE(review): the record separator and the tag/identifier patterns in the
# loop below look as if markup was stripped from them at some point (the
# "top tag" substitution only removes a newline, and the identifier pattern
# names no opening element).  As written, with a newline separator the
# first substitution removes the record's only newline, so the identifier
# pattern -- which requires one -- cannot match.  They are kept exactly as
# found; confirm against the real input format before relying on them.
$/ = "\n";                  # record separator

my $next_id = 1000;         # dummy id for when there is no file
my $file    = $ARGV[0];     # captured before <> starts consuming @ARGV

ensure_dir('import');

RECORD:
while (my $record = <>) {
    exit unless length $record;

    # discard the top and bottom tags
    $record =~ s/\n//;
    $record =~ s/<\/collection>\n//;

    my $source_url   = '';  # identifier value as found in the record
    my $drive_id     = '';  # file id extracted from the identifier
    my $download_url = '';  # direct download link built from the id

    # extract the file path from the identifier
    # use the file name as an id
    if ($record =~ /\s*(.*?)\s*<\/dcvalue>\n/) {
        $source_url = $1;
        if ($source_url =~ /docs/ || $source_url =~ /file/) {
            # Greedy prefix: take the path segment after the last "d/".
            if ($source_url =~ m@.*d/([^?&/]+)@) {
                $drive_id     = $1;
                $download_url = "https://drive.google.com/uc?export=download&id=$drive_id";
                print "$download_url\n";
                print "\n";
            }
            else {
                # Without an id the whole URL would become the file name.
                warn "No file id found in $source_url, skipping download\n";
            }
        }
    }

    my $path = '';
    my $id   = $next_id++;

    # let the operator know where we are up to
    print "$path/$id\n";

    # create the item directory
    my $item_dir = "import/$id";
    ensure_dir($item_dir);

    # create the dublin_core.xml file
    open my $dc, '>', "$item_dir/dublin_core.xml"
        or die "Cannot open dublin core for $id, $!\n";
    print {$dc} $record;
    close $dc or die "Cannot write dublin core for $id, $!\n";

    # assuming we have a file ...
    next RECORD unless $download_url;

    print "$download_url\n";
    print "$drive_id\n";

    # Skip links that appear more than once in the first input file.
    if (count_occurrences($source_url, $file) > 1) {
        print "DUPLICATED";
        next RECORD;
    }

    my $target = "$item_dir/$drive_id";
    if (system('curl', '-L', $download_url, '-o', $target) != 0) {
        warn "curl failed for $download_url (status $?)\n";
        next RECORD;
    }

    my $output = describe_file($target);
    print $output;

    # file(1) prefixes its answer with the path, which contains the id;
    # strip it so an id containing "PDF" or "Word" cannot misclassify.
    (my $kind = $output) =~ s/\A\Q$target\E:\s*//;

    my @contents;
    if ($kind =~ /PDF/) {
        if (move($target, "$target.pdf")) {
            push @contents, "$drive_id.pdf";
        }
        else {
            warn "Cannot rename $target to $target.pdf, $!\n";
        }
    }
    elsif ($kind =~ /Word/) {
        if (system('soffice', '--headless', '--convert-to', 'pdf', $target) != 0) {
            warn "soffice conversion failed for $target (status $?)\n";
        }

        my $have_docx = move($target, "$target.docx");
        warn "Cannot rename $target to $target.docx, $!\n" unless $have_docx;

        # Without --outdir the converted file is expected in the current
        # directory, named after the input file.
        my $converted = "$drive_id.pdf";
        if (-e $converted && move($converted, "$target.pdf")) {
            push @contents, "$drive_id.pdf";
        }
        else {
            warn "No converted PDF available for $target\n";
        }
        push @contents, "$drive_id.docx" if $have_docx;
    }

    # ... create the contents file ...
    if (@contents) {
        open my $out, '>', "$item_dir/contents"
            or die "Cannot open contents for $id, $!\n";
        print {$out} join("\n", @contents);
        close $out or die "Cannot write contents for $id, $!\n";
    }
}

__END__