#!/usr/bin/perl -w

use utf8;
binmode(STDOUT, ":utf8");

# Alphabet from which random local file names are drawn for downloads
# whose URL carries no id that could serve as a name.
my @chars = ("A".."Z", "a".."z", "1".."9", "_");

# Read the input one <dublin_core> element at a time: every read returns
# the text up to and including the next closing tag.
$/ = "</dublin_core>\n"; # record separator

# Counter for item ids: each record takes the current value as the name
# of its directory under import/ and then increments it.
$what = 1000;

# First file named on the command line; it is searched again later to
# find URLs that occur more than once.  Undefined when reading stdin.
$file = $ARGV[0];

# Make sure the output directory exists.  An already existing directory
# is fine; any other failure is reported here rather than surfacing
# later as a confusing "Cannot open dublin core" error.
unless (-d "import") {
    mkdir("import", 0755)
      or die "Cannot create import directory, $!\n";
}

# Content of the files searched for duplicate URLs, keyed by file name,
# so that each file is read from disk only once.
my %text_of;

# Count the non-overlapping literal occurrences of $needle in the file
# $path.  Returns 0 when there is no file to search (input from stdin)
# or when the file cannot be read.
sub _count_occurrences {
    my ($needle, $path) = @_;
    return 0 unless defined $path && length $needle;

    if (!exists $text_of{$path}) {
        my $text = '';
        if (open my $in, '<', $path) {
            local $/;    # slurp; the global separator is </dublin_core>
            $text = <$in>;
            $text = '' unless defined $text;
            close $in;
        }
        else {
            warn "Cannot open $path to look for duplicates, $!\n";
        }
        $text_of{$path} = $text;
    }

    my $count = 0;
    my $from  = 0;
    while (($from = index($text_of{$path}, $needle, $from)) >= 0) {
        $count++;
        $from += length $needle;
    }
    return $count;
}

# Return the description file(1) gives for $path, without the file name
# prefix (-b) so that the name itself can never be mistaken for a type.
# Returns the empty string when file(1) cannot be run.
sub _file_type {
    my ($path) = @_;
    open(my $pipe, '-|', 'file', '-b', $path)
      or do { warn "Cannot run file on $path, $!\n"; return ''; };
    my $kind = do { local $/; <$pipe> };
    close $pipe;
    $kind = '' unless defined $kind;
    $kind =~ s/\s+\z//;
    return $kind;
}

# Main loop: one iteration per <dublin_core> record.  Every record gets
# a directory import/<id>/ holding dublin_core.xml and, when the record
# links to a downloadable file, the file itself plus a contents list.
while (my $record = <>) {

    # discard the top and bottom tags
    $record =~ s/<collection>\n//;
    $record =~ s/<\/collection>\n//;

    # Whatever follows the last </dublin_core> of a file (normally just
    # the closing </collection> tag) is not an item.  Skipping it here
    # means no placeholder directory is created, so nothing has to be
    # removed with "rm -rf" once the loop is over.
    next unless $record =~ m{</dublin_core>};

    my $id  = $what++;
    my $url = '';    # relation URI exactly as found in the record
    my $cul = '';    # local file name for the download
    my $tac = '';    # address the file is fetched from; empty = no download

    # The relation URI tells where the file lives.  The element is only
    # read; it stays in the record written to dublin_core.xml.
    if ($record =~ /<dcvalue element="relation" qualifier="uri">\s*(.*?)\s*<\/dcvalue>\n/) {
        $url = $1;
        if ($url =~ /docs/ || $url =~ /file/) {
            # Google Drive style link: the id is the path segment that
            # follows the last "d/" and names the local file as well.
            if ($url =~ m{.*d/([^?&/]+)}) {
                $cul = $1;
                $tac = "https://drive.google.com/uc?export=download&id=$cul";
            }
            else {
                warn "No file id found in $url, item $id gets no file\n";
            }
        }
        elsif ($url =~ /pdf/ && $url =~ /usi/) {
            # Direct link: fetch it as is, under a random 33 character name.
            $tac = $url;
            $cul .= $chars[rand @chars] for 1 .. 33;
        }
    }

    # let the operator know where we’re up to
    print "/$id\n";

    # create the item directory
    my $item_dir = "import/$id";
    -d $item_dir
      or mkdir($item_dir, 0755)
      or die "Cannot create $item_dir, $!\n";

    # create the dublin_core.xml file
    open my $dc, '>', "$item_dir/dublin_core.xml"
      or die "Cannot open dublin core for $id, $!\n";
    print {$dc} $record;
    close $dc
      or die "Cannot write dublin core for $id, $!\n";

    next unless $tac;

    print "$tac\n";
    print "$cul\n";

    # A URL that occurs more than once in the input is not downloaded
    # for any of the records that carry it.
    if (_count_occurrences($url, $file) > 1) {
        print "DUPLICATED\n";
        next;
    }

    # List form of system: the URL and the file name come from the input
    # data and must never be interpreted by a shell.
    my $target = "$item_dir/$cul";
    if (system('curl', '-L', $tac, '-o', $target) != 0) {
        warn "Download of $tac failed, item $id gets no file\n";
        next;
    }

    my $kind = _file_type($target);
    print "$target: $kind\n";

    # Names to list in the contents file, in the order they are written.
    my @bitstreams;

    if ($kind =~ /PDF/) {
        if (rename($target, "$target.pdf")) {
            @bitstreams = ("$cul.pdf");
        }
        else {
            warn "Cannot rename $target to $target.pdf, $!\n";
        }
    }
    elsif ($kind =~ /Word/) {
        # Convert to PDF next to the original (--outdir) instead of in
        # the current directory, then keep the original as .docx.
        system('soffice', '--headless', '--convert-to', 'pdf',
               '--outdir', $item_dir, $target) == 0
          or warn "Conversion of $target to pdf failed\n";
        push @bitstreams, "$cul.pdf" if -e "$target.pdf";
        if (rename($target, "$target.docx")) {
            push @bitstreams, "$cul.docx";
        }
        else {
            warn "Cannot rename $target to $target.docx, $!\n";
        }
    }

    # create the contents file: one bitstream name per line, no newline
    # after the last one
    if (@bitstreams) {
        open my $out, '>', "$item_dir/contents"
          or die "Cannot open contents for $id, $!\n";
        print {$out} join("\n", @bitstreams);
        close $out
          or die "Cannot write contents for $id, $!\n";
    }
}

__END__