Ignore *.xml and *.iso2709 files; in build.pl give URLs matching /pdf/ and
/usi/ a random local name and remove the last import directory after the loop;
add download-files.sh; strip role abbreviations from contributor names in
marc2dc.pl.

NOTE(review): indentation and blank context lines in the build.pl and
marc2dc.pl hunks were reconstructed from a whitespace-collapsed copy of this
patch. Verify them against the working tree, or apply with
`git apply --ignore-whitespace`.

diff --git a/.gitignore b/.gitignore
index 5532a0e..1d805d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 marc.pl
 other/
+*.xml
+*.iso2709
diff --git a/build.pl b/build.pl
index 02ef90c..cc3257d 100755
--- a/build.pl
+++ b/build.pl
@@ -3,6 +3,8 @@ use utf8;
 
 binmode(STDOUT, ":utf8");
 
+my @chars = ("A".."Z", "a".."z", "1".."9", "_");
+
 $/ = "\n"; # record separator
 $what = 1000; # dummy id for when there’s no file
 
@@ -35,8 +37,10 @@ while (<>) {
         #$cul =~ s@.*(folders/|d/)([^?&/]+).*@\2@;
         $cul =~ s@.*(d/)([^?&/]+).*@$2@;
         $tac = "https://drive.google.com/uc?export=download&id=$cul";
-        print "$tac\n";
-        print "\n";
+    } elsif ($cul =~ /pdf/ && $cul =~ /usi/) {
+        $tac = $cul;
+        $cul = '';
+        $cul .= $chars[rand @chars] for 1..33;
     }
     $path = '';
     $id = $what++;
@@ -57,14 +61,13 @@ while (<>) {
     print DC $_;
     close DC;
 
-    # assuming we have a file ...
     if ($tac) {
         print "$tac\n";
         print "$cul\n";
 
         my $duplicated = `grep -o \"$url\" $file | wc -l`;
         if ($duplicated > 1) {
-            print "DUPLICATED";
+            print "DUPLICATED\n";
             next;
         }
 
@@ -72,7 +75,7 @@ while (<>) {
         #system "curl -s -L \"$tac\" -o import/$id/$cul";
 
         my $output = `file import/$id/$cul`;
-        print $output;
+        print "$output\n";
 
         $ext = '';
 
@@ -98,4 +101,11 @@ while (<>) {
     }
 }
 
+# Remove the import directory of the last id assigned in the loop. Skip this
+# when no id was assigned, so the command can never become "rm -rf import/".
+if (defined $id && length $id) {
+    print "eliminamos $id\n";
+    system "rm", "-rf", "import/$id";
+}
+
 __END__
diff --git a/download-files.sh b/download-files.sh
new file mode 100644
index 0000000..6e7f219
--- /dev/null
+++ b/download-files.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+#
+# Usage: download-files.sh DIR START END
+#
+# For each numbered sub-directory START..END of DIR, extract the URLs found in
+# its dublin_core.xml, download each one under a random 32-character name, and
+# append the names of the resulting PDF/Word files to that directory's
+# "contents" file.
+
+cd "$1" || exit 1
+
+start="$2"
+end="$3"
+for i in $(seq "$start" "$end"); do
+    # Skip missing directories; otherwise the "cd .." below would leave DIR.
+    cd "$i" || continue
+    grep -Eo "(http|https)://[a-zA-Z0-9./?=_%&;:-]*" dublin_core.xml | sort -u | while read -r url; do
+        # LC_ALL=C makes tr treat /dev/urandom as raw bytes on every platform.
+        filename="$(LC_ALL=C tr -dc 'a-zA-Z0-9_' < /dev/urandom | fold -w 32 | head -n 1)"
+
+        echo "$url"
+        # Rewrite matching links to the Google Drive direct-download form,
+        # keeping only the path component that follows "d/".
+        if echo "$url" | grep -qE 'file|docs'; then
+            id="$(echo "$url" | sed -E 's@.*(d/)([^?&/]+).*@\2@')"
+            url="https://drive.google.com/uc?export=download&id=$id"
+        fi
+
+        curl -L "$url" -o "$filename" || continue
+
+        ext=""
+        # -b omits the file name, so a random name containing "PDF" or "Word"
+        # cannot cause a false match below.
+        output="$(file -b "$filename")"
+        if echo "$output" | grep -q PDF; then
+            ext="pdf"
+            mv "$filename" "$filename.pdf"
+        fi
+        if echo "$output" | grep -q Word; then
+            soffice --headless --convert-to pdf "$filename"
+            mv "$filename" "$filename.docx"
+            # Must be "docx" to match the tests below; "word" never matched,
+            # so converted Word files were not recorded.
+            ext="docx"
+        fi
+
+        if [ "$ext" = 'pdf' ] || [ "$ext" = 'docx' ]; then
+            # One name per line; a bare printf "$filename.pdf" emitted no newline.
+            printf '%s\n' "$filename.pdf" | tee -a contents
+            [ "$ext" = 'docx' ] && printf '%s\n' "$filename.docx" | tee -a contents
+        fi
+    done
+    cd ..
+done
diff --git a/marc2dc.pl b/marc2dc.pl
index e8846b2..0f09a54 100755
--- a/marc2dc.pl
+++ b/marc2dc.pl
@@ -40,6 +40,9 @@ while (my $blob = <>) { # suck in one MARC record at a time
             $element = 'contributor';
             $qualifier = 'author';
             $content =~ s/[0-9]//g;
+            # Drop what are presumably contributor-role abbreviations. The \b stops
+            # the pattern from eating the tail of a name such as "Mohamed.".
+            $content =~ s/\b(?:comp|col|dir|pról|coord|ed)\.//g;
             $content =~ s/^\s+|\s+$//g;
         }
         if ($element eq 'format') {