#!/bin/sh
#
# Fetch the documents referenced by each item directory's dublin_core.xml.
#
# Usage: SCRIPT BASE_DIR START END
#   BASE_DIR  directory holding the numbered item sub-directories
#   START     first sub-directory number to process
#   END       last sub-directory number to process (inclusive)
#
# For every sub-directory START..END the script:
#   1. extracts the unique http(s) URLs found in dublin_core.xml;
#   2. rewrites share links that carry a ".../d/<id>" path segment into a
#      Google Drive direct-download URL;
#   3. downloads each URL into a randomly named file;
#   4. inspects the download with file(1):
#        - PDF:  renamed to <name>.pdf and appended to `contents`;
#        - Word: converted to <name>.pdf with soffice, the original is
#                renamed to <name>.docx, and both names are appended to
#                `contents` (PDF first);
#        - anything else is left under its random name and not listed.
#
# NOTE(review): the dublin_core.xml + contents layout looks like a DSpace
# Simple Archive Format import directory - confirm.
#
# Requires: curl, file, seq, and soffice (for Word conversion).

set -u

# Print a diagnostic on stderr.
warn() {
  printf '%s\n' "$*" >&2
}

# Print a diagnostic on stderr and abort.
die() {
  warn "$*"
  exit 1
}

usage() {
  printf 'Usage: %s BASE_DIR START END\n' "${0##*/}" >&2
  exit 2
}

# The original script called `gtr` (presumably the g-prefixed GNU tr found on
# some systems); fall back to plain `tr` where it is not installed.
if command -v gtr >/dev/null 2>&1; then
  tr_cmd=gtr
else
  tr_cmd=tr
fi

# Emit a random 32-character name made of [A-Za-z0-9_].
# LC_ALL=C keeps tr from choking on invalid multibyte sequences in the
# random byte stream.
random_name() {
  LC_ALL=C "$tr_cmd" -dc 'a-zA-Z0-9_' </dev/urandom | fold -w 32 | head -n 1
}

# Print the unique http(s) URLs found in ./dublin_core.xml, one per line.
# "&amp;" is decoded because a literal "&" inside XML text is always escaped.
list_urls() {
  grep -Eo "(http|https)://[a-zA-Z0-9./?=_%&;:-]*" dublin_core.xml \
    | sed 's/&amp;/\&/g' \
    | sort -u
}

# Download one URL into the current directory and record it in `contents`.
# Arguments: $1 - URL to fetch
# Returns:   0 on success (including unrecognised file types), 1 on failure
download_url() {
  src=$1

  # Share links of the form ".../d/<id>/..." become direct-download URLs.
  # URLs that merely contain "file" or "docs" but have no "/d/<id>" part are
  # left untouched instead of being mangled into a bogus id.
  case "$src" in
    *file* | *docs*)
      id=$(printf '%s\n' "$src" | sed -n -E 's@.*/d/([^?&/]+).*@\1@p')
      if [ -n "$id" ]; then
        src="https://drive.google.com/uc?export=download&id=$id"
      fi
      ;;
  esac

  name=$(random_name)
  if [ -z "$name" ]; then
    warn "could not generate a file name for: $src"
    return 1
  fi

  # -f: treat HTTP errors as failures rather than saving the error page.
  # stdin is detached so nothing here can eat the URL list feeding the loop.
  if ! curl -fL "$src" -o "$name" </dev/null; then
    warn "download failed: $src"
    rm -f -- "$name"
    return 1
  fi

  # -b omits the file name from the output; the random name could otherwise
  # contain "PDF" or "Word" by chance and trigger a false match.
  kind=$(file -b "$name")

  case "$kind" in
    *PDF*)
      mv -- "$name" "$name.pdf" || return 1
      printf '%s\n' "$name.pdf" | tee -a contents
      ;;
    *Word*)
      if soffice --headless --convert-to pdf "$name" </dev/null \
        && [ -f "$name.pdf" ]; then
        printf '%s\n' "$name.pdf" | tee -a contents
      else
        warn "PDF conversion failed for: $src"
      fi
      mv -- "$name" "$name.docx" || return 1
      printf '%s\n' "$name.docx" | tee -a contents
      ;;
  esac
}

# Process the item directory that is the current working directory.
process_item() {
  if [ ! -f dublin_core.xml ]; then
    warn "no dublin_core.xml in $PWD"
    return 1
  fi

  list_urls | while IFS= read -r url; do
    printf '%s\n' "$url"
    download_url "$url"
  done
}

[ "$#" -ge 3 ] || usage

base=$1
start=$2
end=$3

cd -- "$base" || die "cannot enter base directory: $base"

items=$(seq "$start" "$end") || die "invalid range: $start..$end"

for i in $items; do
  if [ ! -d "$i" ]; then
    warn "skipping missing item directory: $i"
    continue
  fi
  # The subshell keeps this shell in BASE_DIR whatever happens inside, so no
  # `cd ..` is needed and one bad item cannot derail the following ones.
  (cd -- "$i" && process_item)
done