Update *.pl and add download-files.sh
This commit is contained in:
parent
6c9c024b0b
commit
3d77d6cdd3
|
@ -1,2 +1,4 @@
|
||||||
marc.pl
|
marc.pl
|
||||||
other/
|
other/
|
||||||
|
*.xml
|
||||||
|
*.iso2709
|
||||||
|
|
16
build.pl
16
build.pl
|
@ -3,6 +3,8 @@
|
||||||
use utf8;
|
use utf8;
|
||||||
binmode(STDOUT, ":utf8");
|
binmode(STDOUT, ":utf8");
|
||||||
|
|
||||||
|
my @chars = ("A".."Z", "a".."z", "1".."9", "_");
|
||||||
|
|
||||||
$/ = "</dublin_core>\n"; # record separator
|
$/ = "</dublin_core>\n"; # record separator
|
||||||
|
|
||||||
$what = 1000; # dummy id for when there’s no file
|
$what = 1000; # dummy id for when there’s no file
|
||||||
|
@ -35,8 +37,10 @@ while (<>) {
|
||||||
#$cul =~ s@.*(folders/|d/)([^?&/]+).*@\2@;
|
#$cul =~ s@.*(folders/|d/)([^?&/]+).*@\2@;
|
||||||
$cul =~ s@.*(d/)([^?&/]+).*@$2@;
|
$cul =~ s@.*(d/)([^?&/]+).*@$2@;
|
||||||
$tac = "https://drive.google.com/uc?export=download&id=$cul";
|
$tac = "https://drive.google.com/uc?export=download&id=$cul";
|
||||||
print "$tac\n";
|
} elsif ($cul =~ /pdf/ && $cul =~ /usi/) {
|
||||||
print "\n";
|
$tac = $cul;
|
||||||
|
$cul = '';
|
||||||
|
$cul .= $chars[rand @chars] for 1..33;
|
||||||
}
|
}
|
||||||
$path = '';
|
$path = '';
|
||||||
$id = $what++;
|
$id = $what++;
|
||||||
|
@ -57,14 +61,13 @@ while (<>) {
|
||||||
print DC $_;
|
print DC $_;
|
||||||
close DC;
|
close DC;
|
||||||
|
|
||||||
# assuming we have a file ...
|
|
||||||
if ($tac) {
|
if ($tac) {
|
||||||
print "$tac\n";
|
print "$tac\n";
|
||||||
print "$cul\n";
|
print "$cul\n";
|
||||||
|
|
||||||
my $duplicated = `grep -o \"$url\" $file | wc -l`;
|
my $duplicated = `grep -o \"$url\" $file | wc -l`;
|
||||||
if ($duplicated > 1) {
|
if ($duplicated > 1) {
|
||||||
print "DUPLICATED";
|
print "DUPLICATED\n";
|
||||||
next;
|
next;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -72,7 +75,7 @@ while (<>) {
|
||||||
#system "curl -s -L \"$tac\" -o import/$id/$cul";
|
#system "curl -s -L \"$tac\" -o import/$id/$cul";
|
||||||
|
|
||||||
my $output = `file import/$id/$cul`;
|
my $output = `file import/$id/$cul`;
|
||||||
print $output;
|
print "$output\n";
|
||||||
|
|
||||||
$ext = '';
|
$ext = '';
|
||||||
|
|
||||||
|
@ -98,4 +101,7 @@ while (<>) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
print "eliminamos $id\n";
|
||||||
|
system "rm -rf import/$id";
|
||||||
|
|
||||||
__END__
|
__END__
|
||||||
|
|
|
@ -0,0 +1,38 @@
|
||||||
|
#!/bin/sh
# download-files.sh BASE_DIR START END
#
# For every item directory BASE_DIR/START .. BASE_DIR/END, extract each
# distinct http(s) URL from the item's dublin_core.xml, download it under a
# random 32-character name, and identify the payload with file(1):
#   - PDF:  renamed to NAME.pdf
#   - Word: converted to NAME.pdf with LibreOffice (soffice); the original
#           is kept as NAME.docx
# The resulting file names are appended, one per line, to the item's
# `contents` file (and echoed to stdout). Downloads of any other type are
# left in place and are not listed.
#
# Requires: seq, curl, file, and soffice (only for Word documents).

if [ "$#" -ne 3 ]; then
    echo "usage: $0 BASE_DIR START END" >&2
    exit 2
fi

cd "$1" || exit 1

start="$2"
end="$3"
for i in $(seq "$start" "$end"); do
    # Skip missing items; otherwise the `cd ..` below would climb out of
    # BASE_DIR and every later item would be looked up in the wrong place.
    cd "$i" || continue

    grep -Eo "(http|https)://[a-zA-Z0-9./?=_%&;:-]*" dublin_core.xml | sort -u | while IFS= read -r url; do
        # LC_ALL=C lets both GNU and BSD tr accept arbitrary bytes from
        # /dev/urandom, so the GNU-only `gtr` alias is not needed.
        filename="$(LC_ALL=C tr -dc 'a-zA-Z0-9_' < /dev/urandom | fold -w 32 | head -n 1)"

        printf '%s\n' "$url"

        # Links containing "file" or "docs" are treated as Google Drive
        # share links and rewritten to the direct-download endpoint.
        case $url in
            *file*|*docs*)
                # -n ... p: print only when the pattern matched, so a URL
                # without a ".../d/<id>" segment is left untouched instead
                # of being pasted whole into the id parameter.
                id="$(printf '%s\n' "$url" | sed -n -E 's@.*(d/)([^?&/]+).*@\2@p')"
                if [ -n "$id" ]; then
                    url="https://drive.google.com/uc?export=download&id=$id"
                fi
                ;;
        esac

        curl -L "$url" -o "$filename" || continue

        # -b omits the file name from the description; the random name
        # could otherwise contain "PDF" or "Word" by chance.
        output="$(file -b "$filename")"

        case $output in
            *PDF*)
                mv "$filename" "$filename.pdf"
                printf '%s\n' "$filename.pdf" | tee -a contents
                ;;
            *Word*)
                # stdin is redirected so soffice cannot consume the URL
                # list that feeds this loop.
                soffice --headless --convert-to pdf "$filename" < /dev/null
                mv "$filename" "$filename.docx"
                # List the converted PDF only if the conversion produced it.
                if [ -f "$filename.pdf" ]; then
                    printf '%s\n' "$filename.pdf" | tee -a contents
                fi
                printf '%s\n' "$filename.docx" | tee -a contents
                ;;
        esac
    done

    cd ..
done
|
|
@ -40,6 +40,12 @@ while (my $blob = <>) { # suck in one MARC record at a time
|
||||||
$element = 'contributor';
|
$element = 'contributor';
|
||||||
$qualifier = 'author';
|
$qualifier = 'author';
|
||||||
$content =~ s/[0-9]//g;
|
$content =~ s/[0-9]//g;
|
||||||
|
$content =~ s/comp\.//g;
|
||||||
|
$content =~ s/col\.//g;
|
||||||
|
$content =~ s/dir\.//g;
|
||||||
|
$content =~ s/pról\.//g;
|
||||||
|
$content =~ s/coord\.//g;
|
||||||
|
$content =~ s/ed\.//g;
|
||||||
$content =~ s/^\s+|\s+$//g;
|
$content =~ s/^\s+|\s+$//g;
|
||||||
}
|
}
|
||||||
if ($element eq 'format') {
|
if ($element eq 'format') {
|
||||||
|
|
Loading…
Reference in New Issue