#!/bin/sh
#
# Mirror https://docs.it4i.cz with wget and convert the downloaded HTML pages
# to Markdown with pandoc.
#
# Usage (run from the directory that holds, or will hold, the mirror):
#   -d -html   delete every file matching *.ht* below the current directory
#   -d -md     delete every file matching *.md below the current directory
#   -w         mirror the site into the current directory
#   -c         convert every *.ht* file below the current directory to .md
# Any other invocation does nothing.
#
# Files used by -c, all relative to the current directory:
#   exceptions_filter_auto   sed patterns stripped from the generated filter list
#   filter_other             sed patterns stripped from every converted page
#   filter_auto              generated filter list (removed when -c finishes)
#   filter_autoTMP           scratch file (removed when -c finishes)
#   info/                    list_png.txt, list_jpg.txt, files_md.txt
#
# External tools: wget (-w); pandoc and a grep supporting -m and -P (-c).

SITE_URL="https://docs.it4i.cz"
# The download is stopped as soon as this file exists.
SENTINEL_FILE="docs.it4i.cz/@@search?sort_on=sortable_title&.html"

# Print a diagnostic on stderr.
warn() {
  printf '%s\n' "$*" >&2
}

# delete_files PATTERN
# Remove every non-directory below . whose name matches the find(1) -name
# PATTERN and print "<path> deleted" for each file actually removed.
delete_files() {
  find . -name "$1" ! -type d | while IFS= read -r df_path; do
    if rm -- "$df_path"; then
      printf '%s deleted\n' "$df_path"
    fi
  done
}

# extract_body FILE
# Print the lines of FILE from the first line containing '<h1' up to two
# lines before the first line containing '<!-- <div tal:content='.
# Returns 1 (printing nothing) when either marker is missing.
extract_body() {
  eb_first=$(grep -n -m1 '<h1' "$1" | cut -d: -f1)
  eb_marker=$(grep -n -m1 '<!-- <div tal:content=' "$1" | cut -d: -f1)
  if [ -z "$eb_first" ] || [ -z "$eb_marker" ]; then
    return 1
  fi
  # An empty range (last < first) simply prints nothing.
  awk -v first="$eb_first" -v last="$((eb_marker - 2))" \
    'NR > last { exit } NR >= first' "$1"
}

# strip_patterns TARGET PATTERN_FILE SCRATCH
# For every non-empty line P of PATTERN_FILE run s/P//g over TARGET, using
# SCRATCH as the intermediate file (SCRATCH is left behind for the caller).
# A pattern that sed rejects is reported and skipped, so TARGET is never
# replaced by the empty output of a failed sed run.
strip_patterns() {
  sp_target=$1
  sp_patterns=$2
  sp_scratch=$3
  [ -f "$sp_patterns" ] || return 0
  # Plain `read` (no -r) is kept on purpose: it strips surrounding blanks and
  # processes backslashes, which is how existing pattern files have always
  # been interpreted.
  # shellcheck disable=SC2162
  while read sp_pat || [ -n "$sp_pat" ]; do
    # An empty pattern would make sed fail ("no previous regular expression").
    [ -n "$sp_pat" ] || continue
    if sed -e 's/'"$sp_pat"'//g' "$sp_target" > "$sp_scratch"; then
      cat "$sp_scratch" > "$sp_target"
    else
      warn "warning: pattern '$sp_pat' from $sp_patterns rejected by sed; skipped"
    fi
  done < "$sp_patterns"
}

# convert_page FILE
# Convert one downloaded HTML page to Markdown next to it (same path, .md
# extension) and strip pandoc attribute blocks and leftover markup from it.
convert_page() {
  cp_page=$1
  cp_base=${cp_page%.*}
  cp_md=$cp_base.md
  cp_scratch=${cp_base}TMP.md

  printf '%s\n' "$cp_page"
  printf '\t\tfiltering html files...\n'
  if ! cp_body=$(extract_body "$cp_page"); then
    warn "warning: content markers not found in $cp_page; skipped"
    return 0
  fi

  printf '\t\t.html -> .md\n'
  # The extracted body is piped to pandoc instead of being written to a
  # *TMP.html file, which the enclosing find over *.ht* could pick up.
  if ! printf '%s\n' "$cp_body" |
    pandoc -f html -t markdown+pipe_tables-grid_tables -o "$cp_md"; then
    warn "warning: pandoc failed for $cp_page; skipped"
    return 0
  fi

  # Record the converted page (path without extension).
  printf '%s\n' "$cp_base" >> ./info/files_md.txt

  # Collect the text found between '{' and '}' on each line, dropping
  # entries that still contain '{' or '$', into the shared filter list.
  grep -o -P '(?<={).*(?=})' "$cp_md" | sort -u |
    sed -e '/{/d' -e '/\$/d' >> filter_auto
  sort -u filter_auto -o filter_auto

  # Remove the user-supplied exceptions from the filter list.
  strip_patterns filter_auto exceptions_filter_auto filter_autoTMP

  printf '\t\tautomatic filter...\n'
  # One pass per filter entry: drop "{entry}", backslashes, ": ", "</div>",
  # lines starting with "<div" and empty lines. Plain `read` is kept for the
  # same reason as in strip_patterns.
  # shellcheck disable=SC2162
  while read cp_attr || [ -n "$cp_attr" ]; do
    if sed -e 's/{'"$cp_attr"'}//g' \
      -e 's/\\//g' \
      -e 's/: //g' \
      -e 's/<\/div>//g' \
      -e '/^<div/d' \
      -e '/^$/d' "$cp_md" > "$cp_scratch"; then
      cat "$cp_scratch" > "$cp_md"
    else
      warn "warning: filter entry '$cp_attr' rejected by sed; skipped"
    fi
  done < filter_auto

  printf '\t\tother filter...\n'
  strip_patterns "$cp_md" filter_other "$cp_scratch"

  # Delete the per-page scratch file (absent when no pattern was applied).
  rm -f "$cp_scratch"
}

# -w: mirror the site, stopping once SENTINEL_FILE has been written.
# Returns wget's exit status when wget ends on its own with a failure.
download_site() {
  # Started with '&' rather than wget's own -b so that $! is exactly this
  # wget; looking the PID up by name could hit unrelated wget processes.
  wget -qc --mirror --convert-links --adjust-extension --page-requisites \
    --no-parent "$SITE_URL" &
  ds_pid=$!
  ds_status=0

  until [ -f "$SENTINEL_FILE" ]; do
    if ! kill -0 "$ds_pid" 2>/dev/null; then
      # wget ended without producing the sentinel: stop waiting.
      wait "$ds_pid"
      ds_status=$?
      break
    fi
    echo "Download..."
    sleep 5
  done

  if kill -0 "$ds_pid" 2>/dev/null; then
    kill "$ds_pid"
    wait "$ds_pid" 2>/dev/null
  fi

  if [ "$ds_status" -eq 0 ]; then
    echo "Download complete..."
  else
    warn "warning: wget exited with status $ds_status before $SENTINEL_FILE appeared"
  fi

  # Remove the downloaded "@@s*" files.
  find . -name '@@s*' ! -type d -exec rm -- {} +
  return "$ds_status"
}

# -c: convert the whole mirror below . to Markdown.
convert_site() {
  # Start from a clean state: drop Markdown left over from a previous run.
  delete_files '*.md'

  mkdir -p info || return 1
  find . -name '*.png' >> ./info/list_png.txt
  find . -name '*.jpg' >> ./info/list_jpg.txt

  find . -name '*.ht*' ! -type d | while IFS= read -r cs_page; do
    # stdin is detached so nothing inside can consume the list of pages.
    convert_page "$cs_page" < /dev/null
  done

  rm -f filter_autoTMP filter_auto
}

case "${1-}" in
  -d)
    case "${2-}" in
      -html) delete_files '*.ht*' ;;
      -md) delete_files '*.md' ;;
    esac
    ;;
  -w) download_site ;;
  -c) convert_site ;;
esac