Update *.pl and add download-files.sh
This commit is contained in:
parent
6c9c024b0b
commit
3d77d6cdd3
|
@ -1,2 +1,4 @@
|
|||
marc.pl
|
||||
other/
|
||||
*.xml
|
||||
*.iso2709
|
||||
|
|
16
build.pl
16
build.pl
|
@ -3,6 +3,8 @@
|
|||
use utf8;
|
||||
binmode(STDOUT, ":utf8");
|
||||
|
||||
my @chars = ("A".."Z", "a".."z", "1".."9", "_");
|
||||
|
||||
$/ = "</dublin_core>\n"; # record separator
|
||||
|
||||
$what = 1000; # dummy id for when there’s no file
|
||||
|
@ -35,8 +37,10 @@ while (<>) {
|
|||
#$cul =~ s@.*(folders/|d/)([^?&/]+).*@\2@;
|
||||
$cul =~ s@.*(d/)([^?&/]+).*@$2@;
|
||||
$tac = "https://drive.google.com/uc?export=download&id=$cul";
|
||||
print "$tac\n";
|
||||
print "\n";
|
||||
} elsif ($cul =~ /pdf/ && $cul =~ /usi/) {
|
||||
$tac = $cul;
|
||||
$cul = '';
|
||||
$cul .= $chars[rand @chars] for 1..33;
|
||||
}
|
||||
$path = '';
|
||||
$id = $what++;
|
||||
|
@ -57,14 +61,13 @@ while (<>) {
|
|||
print DC $_;
|
||||
close DC;
|
||||
|
||||
# assuming we have a file ...
|
||||
if ($tac) {
|
||||
print "$tac\n";
|
||||
print "$cul\n";
|
||||
|
||||
my $duplicated = `grep -o \"$url\" $file | wc -l`;
|
||||
if ($duplicated > 1) {
|
||||
print "DUPLICATED";
|
||||
print "DUPLICATED\n";
|
||||
next;
|
||||
}
|
||||
|
||||
|
@ -72,7 +75,7 @@ while (<>) {
|
|||
#system "curl -s -L \"$tac\" -o import/$id/$cul";
|
||||
|
||||
my $output = `file import/$id/$cul`;
|
||||
print $output;
|
||||
print "$output\n";
|
||||
|
||||
$ext = '';
|
||||
|
||||
|
@ -98,4 +101,7 @@ while (<>) {
|
|||
}
|
||||
}
|
||||
|
||||
print "eliminamos $id\n";
|
||||
system "rm -rf import/$id";
|
||||
|
||||
__END__
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
#!/bin/sh
# download-files.sh -- download the files referenced by each item's
# dublin_core.xml and record them in that item's "contents" file.
#
# Usage: download-files.sh BASEDIR START END
#
#   BASEDIR  directory holding one numbered sub-directory per item
#   START    first item number to process (passed to seq)
#   END      last item number to process (passed to seq)
#
# For every item directory the script:
#   1. extracts the unique http(s) URLs found in dublin_core.xml;
#   2. rewrites Google Drive "file"/"docs" links to direct-download links;
#   3. downloads each URL under a random 32-character name;
#   4. keeps PDFs as NAME.pdf; converts Word documents to PDF with soffice
#      and keeps the original as NAME.docx;
#   5. appends the resulting file names, one per line, to "contents"
#      (presumably the DSpace Simple Archive Format manifest -- confirm).
#
# Requires: seq, grep, sed, curl, file, fold, head, tee, and soffice for
# Word conversion.

if [ "$#" -lt 3 ]; then
    echo "usage: $0 BASEDIR START END" >&2
    exit 2
fi

cd "$1" || exit 1

start="$2"
end="$3"

# The original script called GNU tr as "gtr" (the g-prefixed coreutils name);
# use it when present and fall back to the system tr otherwise.
if command -v gtr >/dev/null 2>&1; then
    TR=gtr
else
    TR=tr
fi

# Print a random 32-character name drawn from [a-zA-Z0-9_].
# LC_ALL=C makes tr treat /dev/urandom as raw bytes instead of failing on
# invalid multibyte sequences.
random_name() {
    LC_ALL=C "$TR" -dc 'a-zA-Z0-9_' < /dev/urandom | fold -w 32 | head -n 1
}

for i in $(seq "$start" "$end"); do
    # Skip missing item directories; otherwise the downloads would land in
    # BASEDIR and the "cd .." below would climb out of it.
    cd "$i" || { echo "skipping $i: cannot enter directory" >&2; continue; }

    grep -Eo "(http|https)://[a-zA-Z0-9./?=_%&;:-]*" dublin_core.xml | sort -u | while read -r url; do
        filename="$(random_name)"

        echo "$url"
        if printf '%s\n' "$url" | grep -Eq 'file|docs'; then
            # Take the path segment that follows "d/" as the document id.
            id="$(printf '%s\n' "$url" | sed -E 's@.*(d/)([^?&/]+).*@\2@')"
            url="https://drive.google.com/uc?export=download&id=$id"
        fi

        curl -L "$url" -o "$filename" || { echo "download failed: $url" >&2; continue; }

        ext=""
        # -b omits the file name from the output, so a random name that
        # happens to contain "PDF" or "Word" cannot cause a false match.
        output="$(file -b "$filename")"
        case "$output" in
            *PDF*)
                ext="pdf"
                mv "$filename" "$filename.pdf"
                ;;
            *Word*)
                # soffice writes NAME.pdf next to the input file.
                soffice --headless --convert-to pdf "$filename"
                mv "$filename" "$filename.docx"
                # Must be "docx": the checks below test for that value.
                ext="docx"
                ;;
        esac

        if [ "$ext" = 'pdf' ] || [ "$ext" = 'docx' ]; then
            # One entry per line; the name is passed as data, not as the
            # printf format string.
            printf '%s\n' "$filename.pdf" | tee -a contents
            if [ "$ext" = 'docx' ]; then
                printf '%s\n' "$filename.docx" | tee -a contents
            fi
        fi
    done

    cd .. || exit 1
done
|
|
@ -40,6 +40,12 @@ while (my $blob = <>) { # suck in one MARC record at a time
|
|||
$element = 'contributor';
|
||||
$qualifier = 'author';
|
||||
$content =~ s/[0-9]//g;
|
||||
$content =~ s/comp\.//g;
|
||||
$content =~ s/col\.//g;
|
||||
$content =~ s/dir\.//g;
|
||||
$content =~ s/pról\.//g;
|
||||
$content =~ s/coord\.//g;
|
||||
$content =~ s/ed\.//g;
|
||||
$content =~ s/^\s+|\s+$//g;
|
||||
}
|
||||
if ($element eq 'format') {
|
||||
|
|
Loading…
Reference in New Issue