From 0c3d7b1c7283f9df91e36cc33fdaa967cd7d8762 Mon Sep 17 00:00:00 2001
From: Santiago Lo Coco
Date: Sun, 23 Jul 2023 13:47:47 +0200
Subject: [PATCH] Add initial .pl files

---
# NOTE(review): everything in this "#" section is reviewer commentary placed
# between the "---" separator and the diffstat. It is not part of either
# script and does not touch any hunk.
#
# Condition of this copy
#   It looks like this copy of the patch went through a lossy text extraction
#   (angle-bracket markup removed, entities decoded) -- TODO confirm against
#   commit 0c3d7b1. Evidence visible in this file:
#     - the From: header carries a name but no address;
#     - build.pl has "s/\n//;" directly under "discard the top and bottom
#       tags", next to "s/<\/collection>\n//;", and a line-oriented
#       '$/ = "\n"' combined with patterns that end in "<\/dcvalue>\n";
#     - marc2dc.pl has "s/&/&/gs" (a no-op as displayed) and "s//>/gs" under
#       "escape reserved characters", plus a "} else {" with no opening "if";
#     - the marc2dc.pl hunk header declares 95 added lines, but only 85 are
#       present below (build.pl: 79 declared, 79 present).
#   The missing text cannot be recovered from this file, so nothing has been
#   reconstructed here. Do not apply or run this copy as-is.
#
# build.pl, as shown
#   - Reads input with "while (<>)", one record per '$/'.
#   - Per record: clears $cul/$tac, captures the text in front of
#     "<\/dcvalue>\n" into $cul and, when that text contains "docs", reduces
#     it to the path segment after "d/" and builds a drive.google.com
#     download URL in $tac.
#   - $id comes from the counter $what (starts at 1000) in both branches of
#     the if/else; $path is always the empty string.
#   - Creates import/$id, writes the record ($_) to
#     import/$id/dublin_core.xml.
#   - When $tac is set: downloads with curl to import/$id/$cul, inspects the
#     output of "file", hands Word documents to soffice for PDF conversion,
#     renames the download to "$cul.$ext" and writes that name to
#     import/$id/contents.
#   Points to check:
#     - No "use strict"; $cul, $tac, $path, $id, $what and $ext are
#       undeclared package variables.
#     - $ext is assigned only when the "file" output matches /PDF/ or /Word/;
#       otherwise it keeps the previous record's value, or is undefined on
#       the first record.
#     - $cul originates from the input and is interpolated unquoted into the
#       shell strings passed to system and backticks.
#     - The return values of mkdir and of every system call are ignored.
#     - After the soffice call, "mv" still renames import/$id/$cul (the
#       downloaded file) to ".pdf"; the soffice command gives no output
#       directory -- presumably not what was intended, confirm.
#     - DC and OUT are bareword handles opened with two-argument open.
#
# marc2dc.pl, as shown
#   - Sets '$/' to chr(29) and reads one record per iteration.
#   - Parses each record with MARC::Record->new_from_usmarc, converts it with
#     MARC::Crosswalk::DublinCore->new( qualified => 0 ) and walks
#     $dc->elements.
#   - Lower-cases name, qualifier and scheme; $scheme is not used in the
#     visible code.
#   - Remaps elements: creator -> contributor/author (digits removed,
#     trimmed); format -> description; language gets qualifier "iso"; type
#     has its qualifier cleared; subject has digits removed and is trimmed;
#     title loses "[...]" groups and is trimmed; identifier -> relation/uri.
#   - The output statements after the remapping are incomplete in this copy
#     (see "Condition of this copy").

 build.pl   | 79 +++++++++++++++++++++++++++++++++++++++++++++
 marc2dc.pl | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 174 insertions(+)
 create mode 100755 build.pl
 create mode 100755 marc2dc.pl

diff --git a/build.pl b/build.pl
new file mode 100755
index 0000000..3e9c9f7
--- /dev/null
+++ b/build.pl
@@ -0,0 +1,79 @@
+#!/usr/bin/perl -w
+
+use utf8;
+binmode(STDOUT, ":utf8");
+
+$/ = "\n"; # record separator
+
+$what = 1000; # dummy id for when there’s no file
+
+while (<>) {
+    $cul = "";
+    $tac = "";
+
+    # discard the top and bottom tags
+    s/\n//;
+    s/<\/collection>\n//;
+
+    # extract the file path from the identifier
+    # use the file name as an id
+    # note that identifier element is discarded!
+    #if (s!\s*(.*?)\s*<\/dcvalue>\n!!s) {
+    if (/\s*(.*?)\s*<\/dcvalue>\n/) {
+        $cul = $1;
+        if ($cul =~ /docs/) {
+            #$cul =~ s@.*(folders/|d/)([^?&/]+).*@\2@;
+            $cul =~ s@.*(d/)([^?&/]+).*@$2@;
+            $tac = "https://drive.google.com/uc?export=download&id=$cul";
+            print "$tac\n";
+            print "\n";
+        }
+        $path = '';
+        $id = $what++;
+    } else {
+        $path = '';
+        $id = $what++;
+    }
+
+    # let the operator know where we’re up to
+    print "$path/$id\n";
+
+    # create the item directory
+    mkdir "import/$id", 0755;
+
+    # create the dublin_core.xml file
+    open DC, ">import/$id/dublin_core.xml"
+        or die "Cannot open dublin core for $id, $!\n";
+    print DC $_;
+    close DC;
+
+    # assuming we have a file ...
+    if ($tac) {
+        print "$tac\n";
+        print "$cul\n";
+
+        system "curl -L \"$tac\" -o import/$id/$cul";
+        #system "curl -s -L \"$tac\" -o import/$id/$cul";
+
+        my $output = `file import/$id/$cul`;
+        print $output;
+
+        if ($output =~ /PDF/) {
+            $ext = "pdf";
+        }
+        if ($output =~ /Word/) {
+            system "soffice --headless --convert-to pdf import/$id/$cul";
+            $ext = "pdf";
+        }
+
+        system "mv import/$id/$cul import/$id/$cul.$ext";
+
+        # ... create the contents file ...
+        open OUT, ">import/$id/contents"
+            or die "Cannot open contents for $id, $!\n";
+        print OUT "$cul.$ext";
+        close OUT;
+    }
+}
+
+__END__
diff --git a/marc2dc.pl b/marc2dc.pl
new file mode 100755
index 0000000..82b9ecc
--- /dev/null
+++ b/marc2dc.pl
@@ -0,0 +1,95 @@
+#!/usr/bin/perl -w
+
+use MARC::Crosswalk::DublinCore;
+use MARC::File::USMARC;
+use utf8;
+binmode(STDOUT, ":utf8");
+
+$/ = chr(29); # MARC record separator
+
+print qq|\n|;
+
+while (my $blob = <>) { # suck in one MARC record at a time
+
+    print qq|\n|;
+
+    # convert the MARC to DC
+    my $marc = MARC::Record->new_from_usmarc( $blob );
+    my $crosswalk = MARC::Crosswalk::DublinCore->new( qualified => 0 );
+    my $dc = $crosswalk->as_dublincore( $marc );
+
+    # output the DC as XML
+    for( $dc->elements ) {
+        my $element = lc($_->name);
+        my $qualifier = lc($_->qualifier);
+        my $scheme = lc($_->scheme);
+        my $content = $_->content;
+
+        ##print $_->content;
+
+        #print qq|$element\n|;
+        #print qq|$qualifier\n|;
+        #print qq|$scheme\n|;
+        #print qq|$content\n|;
+
+        # escape reserved characters
+        $content =~ s/&/&/gs;
+        $content =~ s//>/gs;
+
+        # munge attributes for DSpace compatibility
+        if ($element eq 'creator') {
+            $element = 'contributor';
+            $qualifier = 'author';
+            $content =~ s/[0-9]//g;
+            $content =~ s/^\s+|\s+$//g;
+        }
+        if ($element eq 'format') {
+            $element = 'description';
+            $qualifier = '';
+        }
+        if ($element eq 'language') {
+            $qualifier = 'iso';
+        }
+        if ($element eq 'type') {
+            $element = 'type';
+            $qualifier = '';
+        }
+        if ($element eq 'subject') {
+            $content =~ s/[0-9]//g;
+            $content =~ s/^\s+|\s+$//g;
+        }
+        if ($element eq 'title') {
+            $content =~ s/\[[^][]*\]//g;
+            $content =~ s/^\s+|\s+$//g;
+        }
+        if ($element eq 'identifier') {
+            $element = 'relation';
+            $qualifier = 'uri';
+        }
+
+        printf qq| %s\n|, $content;
+        } else {
+            printf qq|>%s\n|, $content;
+        }
+    }
+
+    # if ($ARGV[0]) {
+    # printf qq| %s\n|, $content;
+    # }
+
+    print qq|\n|;
+}
+
+print qq|\n|;
+
+exit;