Add initial .pl files

This commit is contained in:
Santiago Lo Coco 2023-07-23 13:47:47 +02:00
commit 0c3d7b1c72
2 changed files with 174 additions and 0 deletions

79
build.pl Executable file
View File

@ -0,0 +1,79 @@
#!/usr/bin/perl -w
# build.pl - split a <collection> of <dublin_core> records (read from the
# files named on the command line, or from STDIN) into one directory per
# item under import/.  Every item directory gets a dublin_core.xml; when the
# record's relation/uri value points at a Google Docs document, the document
# is downloaded with curl into the item directory and a "contents" file
# naming it is written next to it.
use strict;
use warnings;
use utf8;
binmode(STDOUT, ":utf8");

$/ = "</dublin_core>\n";    # record separator: one record per read

my $import_dir = 'import';
my $next_id    = 1000;      # sequential item id: 1000, 1001, ...

# Make sure the parent directory exists before any item is written.
if (!-d $import_dir) {
    mkdir $import_dir, 0755
        or die "Cannot create $import_dir, $!\n";
}

while (my $record = <>) {
    # discard the top and bottom tags
    $record =~ s/<collection>\n//;
    $record =~ s/<\/collection>\n//;

    # The text after the last record is only the closing </collection> tag;
    # once that is stripped nothing is left, so do not create an item for it.
    next unless $record =~ /\S/;

    my $doc_id       = "";
    my $download_url = "";

    # Look for the document link in the relation/uri value.  The element is
    # only read here; it stays in the record written to dublin_core.xml.
    if ($record =~ /<dcvalue element="relation" qualifier="uri">\s*(.*?)\s*<\/dcvalue>\n/) {
        my $uri = $1;
        if ($uri =~ /docs/) {
            # Take the path segment right after the first "/d/".  Anchoring
            # on "/d/" (rather than a greedy search for "d/") keeps an id
            # that ends in "d" from being mistaken for the marker.
            if ($uri =~ m{/d/([^?&/]+)}) {
                $doc_id       = $1;
                $download_url = "https://drive.google.com/uc?export=download&id=$doc_id";
                print "$download_url\n";
                print "\n";
            }
            else {
                warn "No document id found in $uri; item will have no file\n";
            }
        }
    }

    my $id       = $next_id++;
    my $item_dir = "$import_dir/$id";

    # let the operator know where we are up to
    print "/$id\n";

    # create the item directory (an existing one from an earlier run is reused)
    if (!-d $item_dir) {
        mkdir $item_dir, 0755
            or die "Cannot create $item_dir, $!\n";
    }

    # create the dublin_core.xml file
    open my $dc_fh, '>', "$item_dir/dublin_core.xml"
        or die "Cannot open dublin core for $id, $!\n";
    print {$dc_fh} $record;
    close $dc_fh
        or die "Cannot write dublin core for $id, $!\n";

    # the rest only applies when there is a document to fetch
    next unless $download_url;

    print "$download_url\n";
    print "$doc_id\n";

    my $download = "$item_dir/$doc_id";

    # External commands are run in list form so that no shell is involved
    # and characters in the URL or id cannot be read as shell syntax.
    if (system('curl', '-L', $download_url, '-o', $download) != 0 || !-e $download) {
        warn "Download failed for item $id ($download_url); no contents file written\n";
        next;
    }

    # Ask file(1) what was downloaded.  -b leaves the file name out of the
    # answer so an id that happens to contain "PDF" or "Word" cannot match.
    my $kind = '';
    if (open my $probe, '-|', 'file', '-b', $download) {
        $kind = join '', <$probe>;
        close $probe;
    }
    else {
        warn "Cannot run file on $download, $!\n";
    }
    print "$download: $kind";
    print "\n" unless $kind =~ /\n\z/;

    # Name under which the file ends up in the item directory.  It stays the
    # bare id unless the file is, or has been converted to, a PDF.
    my $stored_name = $doc_id;

    if ($kind =~ /PDF/) {
        if (rename $download, "$download.pdf") {
            $stored_name = "$doc_id.pdf";
        }
        else {
            warn "Cannot rename $download, $!\n";
        }
    }
    elsif ($kind =~ /Word/) {
        # --outdir puts the converted PDF in the item directory; without it
        # soffice writes to the current working directory.
        my $status = system 'soffice', '--headless', '--convert-to', 'pdf',
            '--outdir', $item_dir, $download;
        if ($status == 0 && -e "$download.pdf") {
            unlink $download
                or warn "Cannot remove $download after conversion, $!\n";
            $stored_name = "$doc_id.pdf";
        }
        else {
            warn "Conversion to PDF failed for $download; keeping the Word file\n";
        }
    }
    else {
        warn "Unrecognised file type for $download; keeping it without an extension\n";
    }

    # ... create the contents file naming the stored file
    open my $contents_fh, '>', "$item_dir/contents"
        or die "Cannot open contents for $id, $!\n";
    print {$contents_fh} $stored_name;
    close $contents_fh
        or die "Cannot write contents for $id, $!\n";
}
__END__

95
marc2dc.pl Executable file
View File

@ -0,0 +1,95 @@
#!/usr/bin/perl -w
# marc2dc.pl - read binary MARC records (files named on the command line, or
# STDIN), convert each to Dublin Core with MARC::Crosswalk::DublinCore, and
# print the result on STDOUT as a <collection> of <dublin_core> records whose
# <dcvalue> elements are renamed for DSpace.
#
# Records that cannot be converted are reported on STDERR and skipped; the
# exit status is then 1 instead of 0.
use strict;
use warnings;
use MARC::Crosswalk::DublinCore;
use MARC::File::USMARC;
use utf8;
binmode(STDOUT, ":utf8");

$/ = chr(29);    # MARC record separator

# Lower-case an accessor value, treating undef as the empty string so that
# missing qualifiers/schemes do not raise "uninitialized" warnings.
sub _lc_or_empty {
    my ($value) = @_;
    return defined $value ? lc $value : '';
}

my $skipped = 0;

print qq|<collection>\n|;
while (my $blob = <>) {    # suck in one MARC record at a time
    # Ignore chunks that hold nothing but padding (whitespace, NUL, ^Z or a
    # lone terminator), e.g. a newline after the last record.
    next if $blob =~ /\A[\s\x00\x1a\x1d]*\z/;

    # convert the MARC to DC; done before anything is printed so a failure
    # cannot leave an unclosed <dublin_core> in the output
    my $dc = eval {
        my $marc      = MARC::Record->new_from_usmarc($blob);
        my $crosswalk = MARC::Crosswalk::DublinCore->new( qualified => 0 );
        $crosswalk->as_dublincore($marc);
    };
    if (!$dc) {
        my $reason = $@ || "no record returned\n";
        warn "Skipping MARC record $.: $reason";
        $skipped++;
        next;
    }

    print qq|<dublin_core>\n|;

    # output the DC as XML
    for my $field ( $dc->elements ) {
        my $element   = _lc_or_empty( $field->name );
        my $qualifier = _lc_or_empty( $field->qualifier );
        my $scheme    = _lc_or_empty( $field->scheme );
        my $content   = $field->content;
        $content = '' unless defined $content;

        # escape reserved characters
        $content =~ s/&/&amp;/gs;
        $content =~ s/</&lt;/gs;
        $content =~ s/>/&gt;/gs;

        # munge attributes for DSpace compatibility
        if ($element eq 'creator') {
            $element   = 'contributor';
            $qualifier = 'author';
            $content =~ s/[0-9]//g;          # drop digits
            $content =~ s/^\s+|\s+$//g;      # trim
        }
        elsif ($element eq 'format') {
            $element   = 'description';
            $qualifier = '';
        }
        elsif ($element eq 'language') {
            $qualifier = 'iso';
        }
        elsif ($element eq 'type') {
            $qualifier = '';
        }
        elsif ($element eq 'subject') {
            $content =~ s/[0-9]//g;          # drop digits
            $content =~ s/^\s+|\s+$//g;      # trim
        }
        elsif ($element eq 'title') {
            $content =~ s/\[[^][]*\]//g;     # drop [bracketed] parts
            $content =~ s/^\s+|\s+$//g;      # trim
        }
        elsif ($element eq 'identifier') {
            $element   = 'relation';
            $qualifier = 'uri';
        }

        # the scheme is only used as the qualifier when no qualifier is set
        printf qq| <dcvalue element="%s"|, $element;
        printf qq| qualifier="%s"|, $qualifier if $qualifier;
        printf qq| qualifier="%s"|, $scheme if $scheme and !$qualifier;

        # titles and descriptions carry a language attribute
        if ($element eq 'title' || $element eq 'description') {
            printf qq| language="es_AR">%s</dcvalue>\n|, $content;
        }
        else {
            printf qq|>%s</dcvalue>\n|, $content;
        }
    }

    # Disabled: extra dspace/entity value when an argument is given.
    # if ($ARGV[0]) {
    # printf qq| <dcvalue element="dspace"|, $element;
    # printf qq| qualifier="entity"|, $qualifier if $qualifier;
    # printf qq| qualifier="%s"|, $scheme if $scheme and !$qualifier;
    # printf qq|>%s</dcvalue>\n|, $content;
    # }

    print qq|</dublin_core>\n|;
}
print qq|</collection>\n|;
exit( $skipped ? 1 : 0 );