# html_md.sh

#!/bin/sh

# delete_by_name PATTERN
#   Remove every non-directory entry below the current directory whose name
#   matches the find(1) -name PATTERN and print "<path> deleted" for each
#   entry that was actually removed.
delete_by_name() {
	# "-exec ... {} +" passes the paths as arguments, so names containing
	# blanks, backslashes or newlines arrive intact (a "while read" loop
	# would mangle them). "deleted" is only reported when rm succeeded.
	find . -name "$1" ! -type d -exec sh -c '
		for path in "$@"; do
			rm -- "$path" && printf "%s deleted\n" "$path"
		done' sh {} +
}

# Mode "-d": delete files, selected by the second argument:
#   -d -html   remove everything matching *.ht* (e.g. .htm, .html)
#   -d -md     remove everything matching *.md
if [ "$1" = "-d" ]; then
	case "$2" in
		-html)
			delete_by_name "*.ht*"
			;;
		-md)
			delete_by_name "*.md"
			;;
		*)
			# Nothing is deleted without a recognised selector; say so
			# instead of doing nothing silently.
			echo "usage: $0 -d -html|-md" >&2
			;;
	esac
fi
# wait_for_marker PID FILE [INTERVAL]
#   Poll every INTERVAL seconds (default 5) until FILE exists, printing
#   "Download..." before each pause.
#   Returns 0 once FILE exists, 1 if process PID is gone and FILE is still
#   missing (so the caller cannot end up waiting forever).
wait_for_marker() {
	while [ ! -f "$2" ]; do
		if ! kill -0 "$1" 2>/dev/null; then
			# The process has exited; FILE may have been written just
			# before that, so look once more before giving up.
			[ -f "$2" ]
			return
		fi
		echo "Download..."
		sleep "${3:-5}"
	done
	return 0
}

# Mode "-w": mirror https://docs.it4i.cz with wget into ./docs.it4i.cz.
if [ "$1" = "-w" ]; then
	# download html pages
	# Run wget as a background job of this shell so that $! is exactly its
	# PID; looking it up with pgrep could return unrelated wget processes.
	wget -qc --mirror --convert-links --adjust-extension --page-requisites --no-parent https://docs.it4i.cz &
	pid=$!

	# The script treats the appearance of this search page as the signal
	# that the download has got far enough and wget can be stopped.
	FILE="docs.it4i.cz/@@search?sort_on=sortable_title&.html"

	if wait_for_marker "$pid" "$FILE"; then
		echo "Download complete..."
		kill "$pid" 2>/dev/null
		# Reap the job quietly so the shell prints no "Terminated" notice.
		wait "$pid" 2>/dev/null
	else
		echo "wget exited before $FILE was downloaded" >&2
	fi

	# Remove the downloaded "@@s*" (search) pages.
	find . -name "@@s*" ! -type d -exec rm -f -- {} +

fi
# extract_page_body FILE
#   Print the part of an HTML page that is handed to pandoc: from the first
#   line containing "<h1" up to two lines before the first line containing
#   "<!-- <div tal:content=" (the same slice the former sed pipeline kept).
#   Prints nothing and returns 1, with a note on stderr, when either marker
#   is missing.
extract_page_body() {
	body_first=$(grep -n -m1 '<h1' "$1" | cut -f1 -d:)
	body_marker=$(grep -n -m1 '<!-- <div tal:content=' "$1" | cut -f1 -d:)
	if [ -z "$body_first" ] || [ -z "$body_marker" ]; then
		echo "$1: content markers not found" >&2
		return 1
	fi
	# Selecting the range by line number also works when "<h1" is on line 1,
	# where the old "delete lines 1..HEAD-1" step was handed the range 1,0.
	awk -v first="$body_first" -v last="$((body_marker - 2))" '
		NR > last { exit }
		NR >= first { print }
	' "$1"
}

# Mode "-c": convert every downloaded HTML page below the current directory
# to Markdown with pandoc and strip leftover markup from the result.
# Reads the optional pattern files ./exceptions_filter_auto and
# ./filter_other; writes ./info/*.txt and one .md file next to each page.
if [ "$1" = "-c" ]; then
	# delete all .md files
	find . -name "*.md" ! -type d -exec sh -c '
		for path in "$@"; do
			rm -- "$path" && printf "%s deleted\n" "$path"
		done' sh {} +

	# create folder info (-p: do not fail when it is left from an earlier run)
	mkdir -p info

	# lists of all images; appended to, as before
	find . -name "*.png" >> ./info/list_png.txt
	find . -name "*.jpg" >> ./info/list_jpg.txt

	# html -> md
	find . -name "*.ht*" |
	while IFS= read -r i; do
		page=${i%.*}

		# printf instead of echo: whether echo expands "\t" depends on the shell
		printf '%s\n' "$i"
		printf '\t\tfiltering html files...\n'
		printf '\t\t.html -> .md\n'

		# The page body is piped straight into pandoc, so no *TMP.html file
		# is created inside the tree that find is still walking.
		if ! extract_page_body "$i" |
			pandoc -f html -t markdown+pipe_tables-grid_tables -o "${page}.md"; then
			echo "$i: pandoc failed, page skipped" >&2
			continue
		fi

		# folder info: list of all converted pages (path without extension)
		printf '%s\n' "$page" >> ./info/files_md.txt

		# create filter_auto: collect the text found between "{" and "}" in
		# the Markdown, dropping entries that contain "{" or "$"
		grep -o -P '(?<={).*(?=})' "${page}.md" | sort -u | sed -e '/{/d' -e '/\$/d' >> filter_auto
		sort -u filter_auto -o filter_auto

		# exceptions filter_auto: remove every exception pattern from the
		# collected entries.
		# NOTE(review): the filter loops below keep "read" without -r and use
		# each line as a sed pattern exactly as before, because which text
		# matches depends on it; a line containing "/" is still an invalid
		# pattern. A failing sed now leaves the target file untouched instead
		# of truncating it to empty.
		if [ -f exceptions_filter_auto ]; then
			while read y; do
				sed -e 's/'"$y"'//g' filter_auto > filter_autoTMP &&
					mv filter_autoTMP filter_auto
			done < exceptions_filter_auto
		fi

		# text filtering of html, css, ...
		printf '\t\tautomatic filter...\n'
		while read y; do
			# Remove "{entry}", all backslashes, ": ", "</div>", lines
			# starting with "<div" and empty lines. One sed applies the
			# commands in the same order as the former chain of seds and
			# reports a bad pattern through its exit status.
			sed -e 's/{'"$y"'}//g' -e 's/\\//g' -e 's/: //g' -e 's/<\/div>//g' -e '/^<div/d' -e '/^$/d' "${page}.md" > "${page}TMP.md" &&
				mv "${page}TMP.md" "${page}.md"
		done < filter_auto

		printf '\t\tother filter...\n'
		if [ -f filter_other ]; then
			while read a; do
				# search and delete according to filter_other
				sed -e 's/'"$a"'//g' "${page}.md" > "${page}TMP.md" &&
					mv "${page}TMP.md" "${page}.md"
			done < filter_other
		fi

		# delete temporary files (only left behind when a sed call failed)
		rm -f "${page}TMP.md"

	done
	rm -f filter_autoTMP filter_auto
fi