borg_public/docs/wiki_download.sh
2023-05-29 10:41:03 +02:00

214 lines
5.1 KiB
Bash

#!/usr/bin/env bash
# Requires curl, jq and pandoc (pandoc converts the downloaded wikitext).
#######################################
# Abort the script when a required external command is missing.
# Arguments:
#   $1 - name of the command to look up with `command -v`
#   $2 - optional hint appended to the error message
# Outputs:
#   Writes the error message to stderr when the command is not found.
# Returns:
#   0 when the command exists; otherwise exits the shell with status 1.
#######################################
check_command() {
  # Function-local, so a caller's own variables named cmd/msg are not
  # overwritten by this check.
  local cmd="${1-}"
  local msg="${2-}"
  if ! command -v "${cmd}" > /dev/null 2>&1; then
    {
      echo "${cmd} is not available. Please install it. ${msg}"
      echo "Abort run."
      echo
    } >&2
    exit 1
  fi
}
# Verify the external tools this script invokes before doing anything else.
check_command curl
check_command jq
# pandoc is run by download() to convert the fetched wikitext.
check_command pandoc "It is used to convert the downloaded wiki pages."

# At least one page title must be given on the command line.
if [[ $# -eq 0 ]]; then
  echo "This script needs a list of pages to download from the wiki" >&2
  exit 1
fi
# Space-joined copy of the arguments ("$*" yields the same string the former
# "$@" assignment produced, without mixing array and string semantics).
ALL_PAGES="$*"

echo "Please enter the user name and password to log on to Wiki"
printf '%s' "User: "
# -r keeps backslashes in the input literal.
read -r USERNAME
echo
printf '%s' "Password: "
# -s: do not echo the password; IFS= and -r keep it byte-for-byte as typed.
IFS= read -r -s USERPASS
echo

# Only referenced by the commented-out edit example at the end of the file.
PAGE="Title of an article"
PREFIX_WIKI="https://www.aquila-consortium.org/wiki"
WIKIAPI="${PREFIX_WIKI}/api.php"
# File in which curl keeps the session cookies between requests.
cookie_jar="wikicj"
echo "UTF8 check: ☠"
#################login
echo "Logging into $WIKIAPI as $USERNAME..."
###############
# Login part 1: request a login token from the API; the session cookies are
# kept in the cookie jar so the second request can reuse them.
echo "Get login token..."
# Note the space before every line continuation: without it the option value
# would be glued to the next option once the backslash-newline is removed.
CR=$(curl -s -S \
  --location \
  --retry 2 \
  --retry-delay 5 \
  --cookie "$cookie_jar" \
  --cookie-jar "$cookie_jar" \
  --user-agent "Curl Shell Script" \
  --keepalive-time 60 \
  --header "Accept-Language: en-us" \
  --header "Connection: keep-alive" \
  --compressed \
  --request "GET" "${WIKIAPI}?action=query&meta=tokens&type=login&format=json")
# Pretty-print the raw response for the user.
printf '%s\n' "$CR" | jq .
# Keep the raw response on disk as login.json, replacing any previous copy.
rm -f login.json
printf '%s\n' "$CR" > login.json
TOKEN=$(jq --raw-output '.query.tokens.logintoken' login.json)
TOKEN="${TOKEN//\"/}" # drop any double quotes
TOKEN="${TOKEN%$'\r'}" # drop a trailing carriage return
# jq prints "null" when the field is absent and nothing at all when the
# response is not valid JSON; both mean no token was obtained.
if [[ -z "$TOKEN" || "$TOKEN" == "null" ]]; then
  echo "Getting a login token failed." >&2
  exit 1
else
  echo "Login token is $TOKEN"
  echo "-----"
fi
###############
# Login part 2: post the credentials together with the login token obtained
# in part 1, reusing the same cookie jar.
echo "Logging in..."
# NOTE(review): the password is handed to curl as a command-line argument;
# consider feeding it through a file or stdin if that is a concern here.
CR=$(curl -s -S \
  --location \
  --cookie "$cookie_jar" \
  --cookie-jar "$cookie_jar" \
  --user-agent "Curl Shell Script" \
  --keepalive-time 60 \
  --header "Accept-Language: en-us" \
  --header "Connection: keep-alive" \
  --compressed \
  --data-urlencode "username=${USERNAME}" \
  --data-urlencode "password=${USERPASS}" \
  --data-urlencode "rememberMe=1" \
  --data-urlencode "logintoken=${TOKEN}" \
  --data-urlencode "loginreturnurl=http://www.aquila-consortium.org/wiki/" \
  --request "POST" "${WIKIAPI}?action=clientlogin&format=json")
printf '%s\n' "$CR" | jq .
# The response is passed to jq quoted so that it is not word-split or
# glob-expanded on the way. Without --raw-output the status keeps its JSON
# quotes, hence the substring match below.
STATUS=$(printf '%s\n' "$CR" | jq '.clientlogin.status')
if [[ "$STATUS" == *"PASS"* ]]; then
  echo "Successfully logged in as $USERNAME, STATUS is $STATUS."
  echo "-----"
else
  echo "Unable to login, is logintoken ${TOKEN} correct?" >&2
  # Non-zero status so callers can detect the failed login.
  exit 1
fi
# Target markup format passed to pandoc's -t option by download().
OUTFORMAT="rst"

#######################################
# Fetch the raw wikitext of one page and convert it with pandoc.
# Globals:
#   PREFIX_WIKI (read)  base URL of the wiki
#   OUTFORMAT   (read)  pandoc output format
#   cookie_jar  (read)  cookie file; falls back to "wikicj" when unset
# Arguments:
#   $1 - page title, inserted verbatim into the URL
#   $2 - path of the file to write
# Returns:
#   Exit status of the final sed stage of the pipeline.
#######################################
download() {
  local d_title=$1
  local d_outfile=$2
  local d_jar=${cookie_jar:-wikicj}
  # The sed stage leaves lines holding an absolute (http...) link untouched
  # and appends ".html" to the target of every other `text <target>` link.
  curl -s -S \
    --location \
    --cookie-jar "${d_jar}" \
    --cookie "${d_jar}" \
    "${PREFIX_WIKI}/index.php/${d_title}?action=raw" \
    | pandoc -f mediawiki -t "${OUTFORMAT}" \
    | sed '/`.* <http.*>`/ { b }; s%`\(.*\) <\(.*\)>`%`\1 <\2.html>`%g' > "${d_outfile}"
}
#######################################
# Download one URL into a file, reusing the session cookies.
# Globals:
#   cookie_jar (read)  cookie file; falls back to "wikicj" when unset
# Arguments:
#   $1 - URL to fetch
#   $2 - path of the file to write
# Outputs:
#   Progress message on stdout; a warning on stderr when the URL is unusable.
# Returns:
#   1 when no usable URL was given (nothing is written), otherwise the exit
#   status of curl.
#######################################
download_url() {
  local d_url="$1"
  local d_out="$2"
  local d_jar=${cookie_jar:-wikicj}
  # query_image prints "null" (or nothing) when the API response carries no
  # URL; do not try to fetch that or create an empty output file for it.
  if [[ -z "${d_url}" || "${d_url}" == "null" ]]; then
    echo "No download URL available for ${d_out}, skipping." >&2
    return 1
  fi
  echo "Downloading from $d_url..."
  curl -s -S \
    --location \
    --cookie-jar "${d_jar}" \
    --cookie "${d_jar}" \
    "$d_url" > "${d_out}"
}
#######################################
# Ask the wiki API for the download URL of an uploaded file.
# Globals:
#   WIKIAPI    (read)  full URL of the wiki's api.php
#   cookie_jar (read)  cookie file; falls back to "wikicj" when unset
# Arguments:
#   $1 - file name without the "File:" prefix
# Outputs:
#   The URL on stdout, or "null" when the response carries no URL.
# Returns:
#   Always 0.
#######################################
query_image() {
  local image=$1
  local jar=${cookie_jar:-wikicj}
  local result
  # WIKIAPI already ends in api.php, so it is used as-is. --get turns the
  # --data items into the query string, and --data-urlencode escapes the
  # title so names with spaces or other special characters stay valid.
  result=$(curl -s -S \
    --location \
    --cookie-jar "${jar}" \
    --cookie "${jar}" \
    --get \
    --data "action=query" \
    --data "prop=imageinfo" \
    --data "iiprop=url" \
    --data "format=json" \
    --data-urlencode "titles=File:${image}" \
    "${WIKIAPI}")
  # Take the first (sorted) key of .query.pages and print the URL of its
  # first imageinfo entry; jq -r prints "null" when any of these is missing.
  printf '%s\n' "${result}" | jq -r '
    (.query.pages // {}) as $pages
    | ($pages | keys | .[0]) as $first
    | if $first == null then null else $pages[$first].imageinfo[0].url end'
  return 0
}
# Download every page named on the command line into download/, then fetch
# the images each converted page refers to.
mkdir -p download
# Iterate over the arguments themselves so a title is never word-split or
# glob-expanded against the current directory.
for TITLE in "$@"; do
  OUTFILE="download/${TITLE}.rst"
  download "${TITLE}" "${OUTFILE}"
  # Every line of the converted page containing ".. figure" names an image
  # after the ": " separator; resolve its URL through the API and store the
  # file next to the page.
  while IFS= read -r image_name; do
    [[ -n "${image_name}" ]] || continue
    url=$(query_image "${image_name}")
    download_url "${url}" "download/${image_name}"
  done < <(awk -F ': ' '/\.\. figure/ { print $2; }' "${OUTFILE}")
done
# ###############
# #Get edit token
# echo "Fetching edit token..."
# CR=$(curl -S \
# --location \
# --cookie $cookie_jar \
# --cookie-jar $cookie_jar \
# --user-agent "Curl Shell Script" \
# --keepalive-time 60 \
# --header "Accept-Language: en-us" \
# --header "Connection: keep-alive" \
# --compressed \
# --request "POST" "${WIKIAPI}?action=query&meta=tokens&format=json")
#
# echo "$CR" | jq .
# echo "$CR" > edittoken.json
# EDITTOKEN=$(jq --raw-output '.query.tokens.csrftoken' edittoken.json)
# rm edittoken.json
#
# EDITTOKEN="${EDITTOKEN//\"/}" #replace double quote by nothing
#
# #Remove carriage return!
# printf "%s" "$EDITTOKEN" > edittoken.txt
# EDITTOKEN=$(cat edittoken.txt | sed 's/\r$//')
#
# if [[ $EDITTOKEN == *"+\\"* ]]; then
# echo "Edit token is: $EDITTOKEN"
# else
# echo "Edit token not set."
# exit
# fi
#
# ###############
# #Make a test edit
# #EDITTOKEN="d55014d69f1a8c821073bb6724aced7658904018+\\"
# CR=$(curl -S \
# --location \
# --cookie $cookie_jar \
# --cookie-jar $cookie_jar \
# --user-agent "Curl Shell Script" \
# --keepalive-time 60 \
# --header "Accept-Language: en-us" \
# --header "Connection: keep-alive" \
# --compressed \
# --data-urlencode "title=${PAGE}" \
# --data-urlencode "appendtext={{nocat|2017|01|31}}" \
# --data-urlencode "token=${EDITTOKEN}" \
# --request "POST" "${WIKIAPI}?action=edit&format=json")
#
# echo "$CR" | jq .