39 lines
941 B
Bash
39 lines
941 B
Bash
#!/bin/sh
#
# Download the documents referenced by each item's dublin_core.xml.
#
# Usage: SCRIPT BASE_DIR START END
#
#   BASE_DIR  directory containing one numbered sub-directory per item
#   START     first item number to process
#   END       last item number to process (inclusive, enumerated with seq)
#
# For every item directory, each distinct http(s) URL found in its
# dublin_core.xml is downloaded into that directory under a random
# 32-character name. URLs that look like Google Drive share links are first
# rewritten to Drive's direct-download form. Downloads that file(1) identifies
# as PDF get a .pdf suffix; Word documents are converted to PDF with soffice
# and renamed to .docx. The resulting file names are appended, one per line,
# to the item's "contents" file and echoed to stdout.
#
# NOTE(review): the dublin_core.xml/contents layout looks like a DSpace
# Simple Archive Format package -- confirm with the consumer of "contents".
#
# Requires: curl, file, soffice (LibreOffice), seq.
# Exit status: 2 on bad usage, 1 if BASE_DIR or the item range is unusable,
# 0 otherwise (individual download failures are reported on stderr only).

# Print a diagnostic on stderr.
warn() {
  printf '%s\n' "$*" >&2
}

# Print a random 32-character name drawn from [a-zA-Z0-9_].
random_name() {
  # LC_ALL=C makes tr treat the random bytes as single-byte characters, so
  # plain tr works where a UTF-8 locale would reject them as invalid input.
  LC_ALL=C tr -dc 'a-zA-Z0-9_' </dev/urandom | fold -w 32 | head -n 1
}

# Print the URL that should actually be fetched for $1.
# URLs containing "file" or "docs" are treated as Google Drive share links:
# the path segment following "d/" is taken as the document id and the URL is
# rewritten to the direct-download endpoint. If no id can be extracted the
# URL is returned unchanged instead of producing a bogus download link.
resolve_url() {
  case $1 in
    *file* | *docs*)
      drive_id=$(printf '%s\n' "$1" |
        sed -n -E 's@.*(d/)([^?&/]+).*@\2@p')
      if [ -n "$drive_id" ]; then
        printf '%s\n' \
          "https://drive.google.com/uc?export=download&id=$drive_id"
        return 0
      fi
      ;;
  esac
  printf '%s\n' "$1"
}

# Append file name $1 to ./contents (one name per line) and echo it.
record() {
  printf '%s\n' "$1" | tee -a contents
}

# Download URL $1 into the current directory, classify it, and record it.
# Downloads that are neither PDF nor Word are left in place, unrecorded.
fetch_url() {
  printf '%s\n' "$1"

  src_url=$(resolve_url "$1")
  name=$(random_name)
  if [ -z "$name" ]; then
    warn "could not generate a file name for: $1"
    return 1
  fi

  if ! curl -L "$src_url" -o "$name"; then
    warn "download failed: $src_url"
    return 1
  fi

  # -b omits the file name from the report; the random name could otherwise
  # happen to contain "PDF" or "Word" and be mistaken for the file type.
  kind=$(file -b "$name")
  case $kind in
    *PDF*)
      mv "$name" "$name.pdf" || return 1
      record "$name.pdf"
      ;;
    *Word*)
      # soffice writes <name>.pdf next to the input file.
      soffice --headless --convert-to pdf "$name"
      mv "$name" "$name.docx" || return 1
      if [ -f "$name.pdf" ]; then
        record "$name.pdf"
      else
        warn "PDF conversion failed for: $name.docx"
      fi
      record "$name.docx"
      ;;
  esac
}

# Process the item in the current directory: fetch every distinct URL
# mentioned in ./dublin_core.xml.
process_item() {
  if [ ! -f dublin_core.xml ]; then
    warn "no dublin_core.xml in $PWD"
    return 1
  fi

  # The URLs come out of XML, where "&" is written "&amp;"; decode it before
  # de-duplicating so query strings reach curl intact.
  grep -Eo "(http|https)://[a-zA-Z0-9./?=_%&;:-]*" dublin_core.xml |
    sed 's/&amp;/\&/g' |
    sort -u |
    while IFS= read -r url; do
      # stdin is detached so no child process can swallow the remaining URLs.
      fetch_url "$url" </dev/null
    done
}

# Entry point: main BASE_DIR START END
main() {
  if [ "$#" -lt 3 ]; then
    warn "usage: ${0##*/} BASE_DIR START END"
    return 2
  fi

  cd "$1" || return 1
  start=$2
  end=$3

  items=$(seq "$start" "$end") || return 1

  # $items is deliberately unquoted: seq prints one number per line.
  for i in $items; do
    if [ -d "$i" ]; then
      # Subshell keeps the directory change local, so a problem inside one
      # item can never leave the loop running from the wrong directory.
      (cd "$i" && process_item)
    else
      warn "item directory not found: $i"
    fi
  done

  return 0
}

main "$@"