Update *.pl and add download-files.sh

This commit is contained in:
Santiago Lo Coco 2023-07-23 22:58:07 +02:00
parent 6c9c024b0b
commit 3d77d6cdd3
4 changed files with 57 additions and 5 deletions

2
.gitignore vendored
View File

@ -1,2 +1,4 @@
marc.pl
other/
*.xml
*.iso2709

View File

@ -3,6 +3,8 @@
use utf8;
binmode(STDOUT, ":utf8");
my @chars = ("A".."Z", "a".."z", "1".."9", "_");
$/ = "</dublin_core>\n"; # record separator
$what = 1000; # dummy id for when theres no file
@ -35,8 +37,10 @@ while (<>) {
#$cul =~ s@.*(folders/|d/)([^?&/]+).*@\2@;
$cul =~ s@.*(d/)([^?&/]+).*@$2@;
$tac = "https://drive.google.com/uc?export=download&id=$cul";
print "$tac\n";
print "\n";
} elsif ($cul =~ /pdf/ && $cul =~ /usi/) {
$tac = $cul;
$cul = '';
$cul .= $chars[rand @chars] for 1..33;
}
$path = '';
$id = $what++;
@ -57,14 +61,13 @@ while (<>) {
print DC $_;
close DC;
# assuming we have a file ...
if ($tac) {
print "$tac\n";
print "$cul\n";
my $duplicated = `grep -o \"$url\" $file | wc -l`;
if ($duplicated > 1) {
print "DUPLICATED";
print "DUPLICATED\n";
next;
}
@ -72,7 +75,7 @@ while (<>) {
#system "curl -s -L \"$tac\" -o import/$id/$cul";
my $output = `file import/$id/$cul`;
print $output;
print "$output\n";
$ext = '';
@ -98,4 +101,7 @@ while (<>) {
}
}
print "eliminamos $id\n";
system "rm -rf import/$id";
__END__

38
download-files.sh Normal file
View File

@ -0,0 +1,38 @@
#!/bin/sh
# download-files.sh - fetch the files referenced by each item's dublin_core.xml.
#
# Usage: download-files.sh BASE_DIR START END
#
# For every numbered directory BASE_DIR/START .. BASE_DIR/END this script:
#   1. extracts the unique http(s) URLs found in dublin_core.xml;
#   2. rewrites URLs containing "file" or "docs" into direct Google Drive
#      download links;
#   3. downloads each URL into a randomly named file;
#   4. keeps PDFs as <name>.pdf and converts Word documents to PDF with
#      soffice, keeping the original as <name>.docx;
#   5. appends the resulting file names, one per line, to ./contents.
#
# Downloads of any other type are left in place under their random name and
# are not listed in ./contents.

if [ "$#" -lt 3 ]; then
    echo "usage: $0 BASE_DIR START END" >&2
    exit 2
fi

# Abort early: running the loop from the wrong directory would scatter files.
cd "$1" || exit 1
start="$2"
end="$3"

# GNU tr is commonly installed as "gtr" on macOS; fall back to plain tr
# elsewhere so the script does not depend on that name being present.
if command -v gtr >/dev/null 2>&1; then
    tr_cmd=gtr
else
    tr_cmd=tr
fi

for i in $(seq "$start" "$end"); do
    # Skip missing items; otherwise the "cd .." below would climb out of
    # BASE_DIR and every later iteration would run in the wrong place.
    if ! cd "$i"; then
        echo "skipping $i: cannot enter directory" >&2
        continue
    fi

    grep -Eo 'https?://[a-zA-Z0-9./?=_%&;:-]*' dublin_core.xml | sort -u |
    while IFS= read -r url; do
        # 32 random characters; LC_ALL=C keeps tr from choking on raw bytes.
        filename="$(LC_ALL=C "$tr_cmd" -dc 'a-zA-Z0-9_' </dev/urandom | fold -w 32 | head -n 1)"
        printf '%s\n' "$url"

        # Turn a Drive viewer link (.../d/<id>/...) into a direct download link.
        case "$url" in
            *file*|*docs*)
                id="$(printf '%s\n' "$url" | sed -E 's@.*(d/)([^?&/]+).*@\2@')"
                url="https://drive.google.com/uc?export=download&id=$id"
                ;;
        esac

        if ! curl -L "$url" -o "$filename"; then
            echo "download failed: $url" >&2
            rm -f "$filename"
            continue
        fi

        # -b omits the file name from the output, so a random name that happens
        # to contain "PDF" or "Word" cannot cause a false match.
        output="$(file -b "$filename")"

        # Exactly one branch runs, so a file is never moved twice.
        case "$output" in
            *PDF*)
                mv "$filename" "$filename.pdf"
                printf '%s\n' "$filename.pdf" | tee -a contents
                ;;
            *Word*)
                # stdin is redirected so soffice cannot swallow the URL list
                # that feeds this loop.
                soffice --headless --convert-to pdf "$filename" </dev/null
                mv "$filename" "$filename.docx"
                # List the converted PDF only if the conversion produced it.
                if [ -f "$filename.pdf" ]; then
                    printf '%s\n' "$filename.pdf" | tee -a contents
                fi
                printf '%s\n' "$filename.docx" | tee -a contents
                ;;
        esac
    done

    cd ..
done

View File

@ -40,6 +40,12 @@ while (my $blob = <>) { # suck in one MARC record at a time
$element = 'contributor';
$qualifier = 'author';
$content =~ s/[0-9]//g;
$content =~ s/comp\.//g;
$content =~ s/col\.//g;
$content =~ s/dir\.//g;
$content =~ s/pról\.//g;
$content =~ s/coord\.//g;
$content =~ s/ed\.//g;
$content =~ s/^\s+|\s+$//g;
}
if ($element eq 'format') {