#!/usr/bin/perl -w
#
# Reads records from the file(s) named on the command line (or STDIN) and,
# for each record, creates a numbered directory under ./import containing:
#
#   dublin_core.xml  - the record text as read (after the substitutions below)
#   <name>.pdf/.docx - the document downloaded from the URL found in the
#                      record, when one is recognised
#   contents         - the list of downloaded file names
#
# Directory numbering starts at 1000. The directory created for the last
# record read is removed again at the end of the run.
#
# NOTE(review): the dublin_core.xml/contents layout resembles DSpace's
# Simple Archive Format - presumably that is the target; confirm.
#
# External programs used: curl, file, soffice.

use strict;
use warnings;
use utf8;

use File::Copy qw(move);
use File::Path qw(remove_tree);

binmode(STDOUT, ":utf8");

# Alphabet for the random local file names given to direct PDF links.
my @chars = ("A".."Z", "a".."z", "1".."9", "_");

# NOTE(review): with a newline record separator, the substitution that
# strips "\n" below removes the only newline in a record, so the later
# patterns that require "</collection>\n" or "</dcvalue>\n" cannot match and
# the download branch is never taken. The separator and patterns look like
# they lost some tag text at some point - TODO confirm against the intended
# input format. They are kept exactly as found.
$/ = "\n";

my $next_id = 1000;
my $file    = $ARGV[0];    # saved now: <> shifts @ARGV while reading
my $id;                    # id of the most recently created directory
my $file_content;          # lazily loaded copy of $file for duplicate checks

mkdir "import", 0755;

RECORD:
while (my $record = <>) {
    # Defensive guard kept from the original; note that exiting here skips
    # the final clean-up.
    exit unless defined $record and length($record) > 0;

    $record =~ s/\n//;
    $record =~ s/<\/collection>\n//;

    my $name     = '';     # local file name (without extension)
    my $download = '';     # URL actually fetched
    my $url      = '';     # URL as it appears in the record

    if ($record =~ /\s*(.*?)\s*<\/dcvalue>\n/) {
        $name = $url = $1;
        if ($name =~ /docs/ || $name =~ /file/) {
            # Keep only the path segment that follows the last "d/" and use
            # it as the id of a Google Drive direct-download link.
            $name =~ s@.*(d/)([^?&/]+).*@$2@;
            $download = "https://drive.google.com/uc?export=download&id=$name";
        }
        elsif ($name =~ /pdf/ && $name =~ /usi/) {
            # Direct link: fetch it as is, under a random 33-character name.
            $download = $name;
            $name     = '';
            $name .= $chars[rand @chars] for 1..33;
        }
    }

    my $path = '';
    $id = $next_id++;

    print "$path/$id\n";
    mkdir "import/$id", 0755;

    open my $dc, '>', "import/$id/dublin_core.xml"
        or die "Cannot open dublin core for $id, $!\n";
    print {$dc} $record;
    close $dc
        or die "Cannot close dublin core for $id, $!\n";

    next RECORD unless $download;

    print "$download\n";
    print "$name\n";

    # Skip documents whose URL occurs more than once in the input file.
    if (_count_occurrences($url, $file) > 1) {
        print "DUPLICATED\n";
        next RECORD;
    }

    my $target = "import/$id/$name";

    # List-form system: the URL never passes through a shell.
    system('curl', '-L', $download, '-o', $target) == 0
        or warn "curl failed for $download (status $?)\n";

    my $output = _describe_file($target);
    print "$output\n";

    # Match against the description only; the leading "path:" part of the
    # report contains the file name, which could itself contain "PDF" or
    # "Word".
    my $description = $output;
    $description =~ s/^\Q$target\E:\s*//;

    my $ext = '';
    if ($description =~ /PDF/) {
        $ext = "pdf";
        move($target, "$target.pdf")
            or warn "Cannot rename $target to $target.pdf, $!\n";
    }
    if ($description =~ /Word/) {
        # The converted PDF is expected in the current working directory
        # (the original script moved it from there into the item directory).
        system('soffice', '--headless', '--convert-to', 'pdf', $target) == 0
            or warn "soffice failed for $target (status $?)\n";
        $ext = "docx";
        move($target, "$target.docx")
            or warn "Cannot rename $target to $target.docx, $!\n";
        move("$name.pdf", "$target.pdf")
            or warn "Cannot move $name.pdf to $target.pdf, $!\n";
    }

    if ($ext eq 'pdf' || $ext eq 'docx') {
        open my $out, '>', "import/$id/contents"
            or die "Cannot open contents for $id, $!\n";
        print {$out} "$name.pdf";
        print {$out} "\n$name.docx" if $ext eq 'docx';
        close $out
            or die "Cannot close contents for $id, $!\n";
    }
}

# Remove the directory created for the last record read. Guarded so that an
# empty input can never turn this into a removal of the whole import tree.
remove_tree("import/$id") if defined $id;

# Count literal, non-overlapping occurrences of $needle in the file named
# $source_file. The file is read once and cached. Returns 0 when there is no
# file name (input came from STDIN) or the file cannot be read.
sub _count_occurrences {
    my ($needle, $source_file) = @_;

    return 0 unless defined $source_file;
    return 0 unless defined $needle and length $needle;

    if (!defined $file_content) {
        $file_content = '';
        if (open my $in, '<', $source_file) {
            local $/;
            my $slurped = <$in>;
            $file_content = $slurped if defined $slurped;
            close $in;
        }
        else {
            warn "Cannot read $source_file for duplicate check, $!\n";
        }
    }

    my $count = () = $file_content =~ /\Q$needle\E/g;
    return $count;
}

# Run file(1) on $target without a shell and return its complete output
# (including the trailing newline), or '' when it cannot be run.
sub _describe_file {
    my ($target) = @_;

    open my $pipe, '-|', 'file', $target
        or do {
            warn "Cannot run file on $target, $!\n";
            return '';
        };
    local $/;
    my $report = <$pipe>;
    close $pipe;

    return defined $report ? $report : '';
}

__END__