koha-to-dspace/download-files.sh

39 lines
941 B
Bash

#!/bin/sh
#
# Download the files referenced by each item's dublin_core.xml.
#
# Usage: download-files.sh BASE_DIR START END
#
#   BASE_DIR  directory containing one sub-directory per item, named by number
#   START     first item number to process
#   END       last item number to process (inclusive, expanded with seq)
#
# For every http(s) URL found in BASE_DIR/<n>/dublin_core.xml the target is
# downloaded into BASE_DIR/<n>/ under a random 32-character name. PDF files
# get a ".pdf" suffix; Word files are converted to PDF with soffice and get a
# ".docx" suffix. Each kept file name is appended, one per line, to the
# "contents" file of the item directory. Downloads of any other type are left
# in place under their random name and are not listed.
#
# Requires: curl, file, seq, and soffice (LibreOffice) for Word conversion.

# Print a random 32-character name made of [A-Za-z0-9_].
# LC_ALL=C keeps tr from rejecting raw random bytes as invalid multibyte
# sequences, so plain tr works where GNU tr ("gtr") is not installed.
random_name() {
  LC_ALL=C tr -dc 'a-zA-Z0-9_' </dev/urandom | fold -w 32 | head -n 1
}

# Print the URL that should actually be fetched for $1.
# URLs mentioning "file" or "docs" that carry a ".../d/<id>" segment are
# treated as Google Drive/Docs share links and mapped to the direct-download
# endpoint; every other URL is printed unchanged.
resolve_download_url() {
  case "$1" in
    *file* | *docs*)
      _drive_id=$(printf '%s\n' "$1" | sed -E 's@.*/d/([^?&/]+).*@\1@')
      # sed echoes its input untouched when the pattern does not match, so
      # an "id" equal to the whole URL means there was no /d/<id> segment.
      if [ -n "$_drive_id" ] && [ "$_drive_id" != "$1" ]; then
        printf '%s\n' "https://drive.google.com/uc?export=download&id=$_drive_id"
        return 0
      fi
      ;;
  esac
  printf '%s\n' "$1"
}

# Map a file(1) description ($1) to the extension used for it:
# prints "pdf", "docx", or nothing when the type is not handled.
classify_download() {
  case "$1" in
    *PDF*) printf '%s\n' pdf ;;
    *Word*) printf '%s\n' docx ;;
  esac
}

# Append file name $1 as its own line to ./contents and echo it to stdout.
record_bitstream() {
  printf '%s\n' "$1" | tee -a contents
}

# Download URL $1 into the current directory, rename it according to its
# detected type and record the resulting file(s) in ./contents.
# Returns non-zero when no name could be generated or the download failed.
fetch_bitstream() {
  url=$1
  filename=$(random_name)
  if [ -z "$filename" ]; then
    printf 'error: could not generate a file name for %s\n' "$url" >&2
    return 1
  fi

  printf '%s\n' "$url"
  url=$(resolve_download_url "$url")
  if ! curl -L "$url" -o "$filename"; then
    printf 'warning: download failed: %s\n' "$url" >&2
    return 1
  fi

  # -b omits the file name from the description, so a random name that
  # happens to contain "PDF" or "Word" cannot cause a false match.
  kind=$(classify_download "$(file -b "$filename")")
  case "$kind" in
    pdf)
      mv "$filename" "$filename.pdf" && record_bitstream "$filename.pdf"
      ;;
    docx)
      # soffice writes the converted copy next to the input as <name>.pdf.
      soffice --headless --convert-to pdf "$filename"
      mv "$filename" "$filename.docx"
      # List the PDF only when the conversion really produced it.
      if [ -f "$filename.pdf" ]; then
        record_bitstream "$filename.pdf"
      fi
      record_bitstream "$filename.docx"
      ;;
  esac
}

# Process item directory $1: extract the unique URLs from its dublin_core.xml
# and fetch each one. The body is a subshell, so the caller's working
# directory is unaffected even when the directory cannot be entered.
process_item() (
  if ! cd "$1"; then
    printf 'warning: cannot enter item directory %s, skipping\n' "$1" >&2
    exit 1
  fi
  if [ ! -r dublin_core.xml ]; then
    printf 'warning: %s has no readable dublin_core.xml, skipping\n' "$1" >&2
    exit 1
  fi
  grep -Eo "(http|https)://[a-zA-Z0-9./?=_%&;:-]*" dublin_core.xml \
    | sort -u \
    | while IFS= read -r url; do
      # Detach stdin so curl/soffice cannot consume the remaining URL list.
      fetch_bitstream "$url" </dev/null
    done
)

# Entry point: validate the arguments, enter BASE_DIR and walk START..END.
# Returns 2 on a usage error, 1 when BASE_DIR cannot be entered, else 0;
# per-item failures are reported on stderr and do not stop the run.
main() {
  if [ "$#" -lt 3 ]; then
    printf 'Usage: %s BASE_DIR START END\n' "${0##*/}" >&2
    return 2
  fi
  if ! cd "$1"; then
    printf 'error: cannot enter base directory %s\n' "$1" >&2
    return 1
  fi
  start=$2
  end=$3
  for i in $(seq "$start" "$end"); do
    process_item "$i"
  done
  return 0
}

main "$@"