From 6c9c024b0b0075d2c7f449c670532a19a7f5f7b8 Mon Sep 17 00:00:00 2001 From: Santiago Lo Coco Date: Sun, 23 Jul 2023 17:28:25 +0200 Subject: [PATCH] Update *.pl files --- build.pl | 40 +++++++++++++++++++++++++++++++--------- marc2dc.pl | 18 ++++++++---------- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/build.pl b/build.pl index 3e9c9f7..02ef90c 100755 --- a/build.pl +++ b/build.pl @@ -7,9 +7,18 @@ $/ = "\n"; # record separator $what = 1000; # dummy id for when there’s no file +$file = $ARGV[0]; + +mkdir "import", 0755; + while (<>) { $cul = ""; $tac = ""; + $url = ""; + + if (!(defined $_ and length $_ > 0)) { + exit; + } # discard the top and bottom tags s/\n//; @@ -21,7 +30,8 @@ while (<>) { #if (s!\s*(.*?)\s*<\/dcvalue>\n!!s) { if (/\s*(.*?)\s*<\/dcvalue>\n/) { $cul = $1; - if ($cul =~ /docs/) { + $url = $1; + if ($cul =~ /docs/ || $cul =~ /file/) { #$cul =~ s@.*(folders/|d/)([^?&/]+).*@\2@; $cul =~ s@.*(d/)([^?&/]+).*@$2@; $tac = "https://drive.google.com/uc?export=download&id=$cul"; @@ -52,27 +62,39 @@ while (<>) { print "$tac\n"; print "$cul\n"; + my $duplicated = `grep -o \"$url\" $file | wc -l`; + if ($duplicated > 1) { + print "DUPLICATED"; + next; + } + system "curl -L \"$tac\" -o import/$id/$cul"; #system "curl -s -L \"$tac\" -o import/$id/$cul"; my $output = `file import/$id/$cul`; print $output; + $ext = ''; + if ($output =~ /PDF/) { $ext = "pdf"; + system "mv import/$id/$cul import/$id/$cul.pdf"; } if ($output =~ /Word/) { system "soffice --headless --convert-to pdf import/$id/$cul"; - $ext = "pdf"; + $ext = "docx"; + system "mv import/$id/$cul import/$id/$cul.docx"; + system "mv $cul.pdf import/$id/$cul.pdf"; } - system "mv import/$id/$cul import/$id/$cul.$ext"; - - # ... create the contents file ... - open OUT, ">import/$id/contents" - or die "Cannot open contents for $id, $!\n"; - print OUT "$cul.$ext"; - close OUT; + if ($ext eq 'pdf' || $ext eq 'docx') { + # ... create the contents file ... + open OUT, ">import/$id/contents" + or die "Cannot open contents for $id, $!\n"; + print OUT "$cul.pdf"; + print OUT "\n$cul.docx" if $ext eq 'docx'; + close OUT; + } } } diff --git a/marc2dc.pl b/marc2dc.pl index 82b9ecc..e8846b2 100755 --- a/marc2dc.pl +++ b/marc2dc.pl @@ -11,27 +11,25 @@ print qq|\n|; while (my $blob = <>) { # suck in one MARC record at a time - print qq|\n|; - # convert the MARC to DC my $marc = MARC::Record->new_from_usmarc( $blob ); my $crosswalk = MARC::Crosswalk::DublinCore->new( qualified => 0 ); my $dc = $crosswalk->as_dublincore( $marc ); + my $has_content = 0; + # output the DC as XML for( $dc->elements ) { + if (!$has_content) { + $has_content = 1; + print qq|\n|; + } + my $element = lc($_->name); my $qualifier = lc($_->qualifier); my $scheme = lc($_->scheme); my $content = $_->content; - ##print $_->content; - - #print qq|$element\n|; - #print qq|$qualifier\n|; - #print qq|$scheme\n|; - #print qq|$content\n|; - # escape reserved characters $content =~ s/&/&/gs; $content =~ s/) { # suck in one MARC record at a time # printf qq|>%s\n|, $content; # } - print qq|\n|; + print qq|\n| if $has_content; } print qq|\n|;