Forked from
SCS / docs.it4i.cz
1529 commits behind, 6 commits ahead of the upstream repository.
-
Lukáš Krupčík authored
html_md.sh 3.00 KiB
#!/bin/sh
# Delete every entry below the current directory whose name matches the
# find(1) -name pattern given as $1, and print "<path> deleted" for each
# entry that rm actually removed.
# IFS= and -r make read return each path verbatim, so names containing
# backslashes or leading/trailing blanks are passed to rm unchanged; "--"
# stops rm from treating a path as an option.
_delete_matching() {
  find . -name "$1" |
  while IFS= read -r doomed; do
    rm -- "$doomed" && printf '%s deleted\n' "$doomed"
  done
}

# Mode -d: clean-up.
#   -d -html   delete all *.ht* files below the current directory
#   -d -md     delete all *.md files below the current directory
if [ "$1" = "-d" ]; then
  if [ "$2" = "-html" ]; then
    _delete_matching "*.ht*"
  fi
  if [ "$2" = "-md" ]; then
    _delete_matching "*.md"
  fi
fi
# Mode -w: mirror https://docs.it4i.cz with wget, stop the download as soon as
# the marker file named in FILE exists, then delete every "@@s*" entry below
# the current directory.
if [ "$1" = "-w" ]; then
  # Run wget as a background job of this shell (rather than `wget -b` followed
  # by `pgrep wget`) so that $! is the PID of exactly this download; pgrep
  # would also return any unrelated wget process running on the machine.
  wget -qc --mirror --convert-links --adjust-extension --page-requisites --no-parent https://docs.it4i.cz &
  pid=$!
  # The name contains '?' and '&', so it must stay quoted wherever it is used.
  FILE="docs.it4i.cz/@@search?sort_on=sortable_title&.html"
  # Poll for the marker file every 5 seconds.
  until [ -f "$FILE" ]; do
    # If wget has already exited, the marker will never appear: stop waiting
    # instead of looping forever.
    if ! kill -0 "$pid" 2>/dev/null; then
      break
    fi
    echo "Download..."
    sleep 5
  done
  download_ok=0
  if [ -f "$FILE" ]; then
    echo "Download complete..."
    kill "$pid" 2>/dev/null
    download_ok=1
  else
    echo "wget exited before $FILE was downloaded" >&2
  fi
  # Reap the background job; its exit status is not used.
  wait "$pid" 2>/dev/null
  # Remove the "@@s*" entries; -exec passes each path to rm verbatim.
  find . -name "@@s*" -exec rm -- {} +
  if [ "$download_ok" -ne 1 ]; then
    exit 1
  fi
fi
# Mode -c: convert every *.ht* file below the current directory to Markdown
# with pandoc, then strip leftover markup from the generated .md files.
# Reads the pattern lists ./exceptions_filter_auto and ./filter_other, uses
# ./filter_auto and ./filter_autoTMP as scratch files, and appends inventories
# to ./info/list_png.txt, ./info/list_jpg.txt and ./info/files_md.txt.
if [ "$1" = "-c" ]; then
  # Verify pandoc is available before any existing .md file is removed.
  if ! command -v pandoc >/dev/null 2>&1; then
    echo "pandoc not found in PATH" >&2
    exit 1
  fi

  # Delete all .md files so that the conversion starts from a clean tree.
  # IFS= and -r hand each path to rm exactly as find printed it.
  find . -name "*.md" |
  while IFS= read -r stale; do
    rm -- "$stale" && printf '%s deleted\n' "$stale"
  done

  # Folder info: -p keeps a rerun from failing when it already exists.
  mkdir -p info || exit 1
  find . -name "*.png" >> ./info/list_png.txt
  find . -name "*.jpg" >> ./info/list_jpg.txt

  # Snapshot the list of pages first: the loop below creates and removes
  # "...TMP.html" files inside the same tree, which a find that is still
  # traversing could otherwise report as pages.
  html_list=./info/list_html.tmp
  find . -name "*.ht*" > "$html_list"

  # html -> md. The list is read on fd 3 so stdin stays untouched for the
  # commands inside the loop.
  while IFS= read -r page <&3; do
    stem=${page%.*}

    # filtering html
    printf '%s\n' "$page"
    # printf is used for the progress lines because echo only turns "\t" into
    # a tab on some shells.
    printf '\t\tfiltering html files...\n'
    # Line number of the first "<h1", and of the first
    # "<!-- <div tal:content=" marker; either is empty when there is no match.
    head_line=$(grep -n -m1 '<h1' "$page" | cut -f1 -d:)
    end_line=$(grep -n -m1 '<!-- <div tal:content=' "$page" | cut -f1 -d:)
    total=$(wc -l < "$page")
    # Number of trailing lines to drop (an empty end_line counts as 0).
    drop_tail=$((total - end_line + 2))
    if [ -n "$head_line" ]; then
      # Keep everything from the first <h1 line on ("N,$p" is also correct for
      # N = 1, unlike a "1,N-1 d" range), then let the second sed drop the
      # last $drop_tail lines.
      sed -n "${head_line},\$p" "$page" |
        sed -n -e :a -e '1,'"$drop_tail"'!{P;N;D;};N;ba' > "${stem}TMP.html"
    else
      # No <h1 in the page: the filtered document is empty.
      : > "${stem}TMP.html"
    fi

    # converted .html to .md
    printf '\t\t.html -> .md\n'
    pandoc -f html -t markdown+pipe_tables-grid_tables "${stem}TMP.html" -o "${stem}.md"
    rm "${stem}TMP.html"

    # folder info: list of all converted files (path without extension)
    printf '%s\n' "$stem" >> ./info/files_md.txt

    # create filter_auto: collect the text found between "{" and "}" in the
    # new .md, drop candidates containing "{" or "$", and merge them into the
    # cumulative, de-duplicated list.
    grep -o -P '(?<={).*(?=})' "${stem}.md" | sort -u | sed '/{/d' | sed '/\$/d' >> filter_auto
    sort -u filter_auto -o filter_auto

    # exceptions filter_auto: blank out every match of each exception pattern.
    # NOTE(review): the pattern-list loops below keep plain `read` (no -r, no
    # IFS=) on purpose; the list files are not visible here and their entries
    # may depend on that parsing -- confirm before changing.
    cat exceptions_filter_auto |
    while read exception; do
      # Only replace filter_auto when sed succeeded, so a pattern sed rejects
      # cannot empty the list.
      sed -e 's/'"$exception"'//g' filter_auto > filter_autoTMP &&
        cat filter_autoTMP > filter_auto
    done

    # text filtering of html, css, ...
    printf '\t\tautomatic filter...\n'
    cat filter_auto |
    while read token; do
      # Remove "{token}", then backslashes, ": ", "</div>", lines starting
      # with "<div" and empty lines. The fixed clean-ups stay inside the loop
      # so every pass sees the text produced by the previous one; the .md is
      # only overwritten when sed succeeded.
      sed -e 's/{'"$token"'}//g' \
          -e 's/\\//g' \
          -e 's/: //g' \
          -e 's/<\/div>//g' \
          -e '/^<div/d' \
          -e '/^$/d' \
          "${stem}.md" > "${stem}TMP.md" &&
        cat "${stem}TMP.md" > "${stem}.md"
    done

    printf '\t\tother filter...\n'
    cat filter_other |
    while read pattern; do
      # search and delete according to filter_other
      sed -e 's/'"$pattern"'//g' "${stem}.md" > "${stem}TMP.md" &&
        cat "${stem}TMP.md" > "${stem}.md"
    done

    # delete temporary files (-f: the file is absent when both lists were empty)
    rm -f "${stem}TMP.md"
  done 3< "$html_list"

  rm -f "$html_list" filter_autoTMP filter_auto
fi