#!/usr/bin/env python3
# ##### BEGIN GPL LICENSE BLOCK #####
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# ##### END GPL LICENSE BLOCK #####
# <pep8 compliant>
"""
Script for checking source code spelling.
   python3 source/tools/check_source/check_spelling.py some_source_file.py

- Pass in a path for it to be checked recursively.
- Pass in '--extract=STRINGS' to check strings instead of comments.

Currently only python source is checked.
"""
import os
import argparse
from typing import (
    Callable,
    Dict,
    Generator,
    List,
    Optional,
    Set,
    Tuple,
)
# Report: word, line, column.
Report = Tuple[str, int, int]
# Cache: {filepath: length, hash, reports}.
CacheData = Dict[str, Tuple[int, bytes, List[Report]]]
# Map word to suggestions.
SuggestMap = Dict[str, str]
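# Illustrative cache entry for one file (hypothetical values):
#   cache_data["some/file.c"] = (
#       1024,               # File length in bytes.
#       b"...sha512...",    # Digest, used to detect modified files.
#       [("Tihs", 10, 4)],  # Reports: (word, line, column), zero-based.
#   )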
ONLY_ONCE = True
USE_COLOR = True
_words_visited = set()
_files_visited = set()
# Lowercase word -> suggestion list.
_suggest_map: SuggestMap = {}
VERBOSE_CACHE = False
if USE_COLOR:
    COLOR_WORD = "\033[92m"
    COLOR_ENDC = "\033[0m"
else:
    COLOR_WORD = ""
    COLOR_ENDC = ""
import enchant
dict_spelling = enchant.Dict("en_US")

# Word lists used by `spell_check_file` below; `check_spelling_c_config.py`
# is expected alongside this script in `source/tools/check_source/`.
from check_spelling_c_config import (
    dict_custom,
    dict_ignore,
)
# -----------------------------------------------------------------------------
# General Utilities
def hash_of_file_and_len(fp: str) -> Tuple[bytes, int]:
    import hashlib
    with open(fp, 'rb') as fh:
        data = fh.read()
    m = hashlib.sha512()
    m.update(data)
    return m.digest(), len(data)

import re
re_vars = re.compile("[A-Za-z]+")
# First remove this from comments, so we don't spell check example code, doxygen commands, etc.
re_ignore = re.compile(
    r'('
    # URL.
    r'(https?|ftp)://\S+|'
    # Email address: <me@email.com>
    # <someone@foo.bar-baz.com>
    r"<\w+@[\w\.\-]+>|"
    # Convention for TODO/FIXME messages: TODO(my name) OR FIXME(name+name) OR XXX(some-name) OR NOTE(name/other-name):
    r"\b(TODO|FIXME|XXX|NOTE)\([A-Za-z\s\+\-/]+\)|"
    # Doxygen style: <pre> ... </pre>
    r"<pre>.+</pre>|"
    # Doxygen style: \code ... \endcode
    r"\s+\\code\b.+\s\\endcode\b|"
    # Doxygen style #SOME_CODE.
    r'#\S+|'
    # Doxygen commands: \param foo
    r"\\(section|subsection|subsubsection|ingroup|param|page|a|see)\s+\S+|"
    # Doxygen commands without any arguments after them: \command
    r"\\(retval|todo)\b|"
    # Doxygen 'param' syntax used rarely: \param foo[in,out]
    r"\\param\[[a-z,]+\]\S*|"
    # Words containing underscores: a_b
    r'\S*\w+_\S+|'
    # Words containing arrows: a->b
    r'\S*\w+\->\S+|'
    # Words containing dot notation: a.b (NOT ab... since this is used in English).
    r'\w+\.\w+\S*|'
    # Single and back-tick quotes (often used to reference code).
    r"\s\`[^\n`]+\`|"
    r"\s'[^\n']+'"
    r')',
    re.MULTILINE | re.DOTALL,
)
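# Sanity sketch of the ignore pass (hypothetical input, commented out):
#   >>> re_ignore.sub("", "See https://example.org and `some_code` for TODO(name) details")
# Only the plain English ("See", "and", "for", "details") survives for checking;
# the script itself substitutes spaces instead of deleting, to keep offsets stable.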
# Then extract words.
re_words = re.compile(
    r"\b("
    # Capital words, with optional '-' and "'".
    r"[A-Z]+[\-'A-Z]*[A-Z]|"
    # Lowercase words, with optional '-' and "'".
    r"[A-Za-z][\-'a-z]*[a-z]+"
    r")\b"
)
re_not_newline = re.compile("[^\n]")
def words_from_text(text: str) -> List[Tuple[str, int]]:
    """
    Extract words to treat as English for spell checking.
    """
    # Replace non-newlines with white-space, so all alignment is kept.
    def replace_ignore(match: re.Match[str]) -> str:
        start, end = match.span()
        return re_not_newline.sub(" ", match.string[start:end])

    # Handy for checking what we ignore, in case we ignore too much and miss real errors.
    # for match in re_ignore.finditer(text):
    #     print(match.group(0))

    # Strip out URL's, code-blocks, etc.
    text = re_ignore.sub(replace_ignore, text)

    words = []
    for match in re_words.finditer(text):
        words.append((match.group(0), match.start()))

    def word_ok(w: str) -> bool:
        # Ignore all uppercase words.
        if w.isupper():
            return False
        return True
    words[:] = [w for w in words if word_ok(w[0])]
    return words

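# Hedged usage sketch (commented out; hypothetical input):
#   >>> words_from_text("TODO(name): teh alignment is kept")
#   [('teh', 12), ('alignment', 16), ('is', 26), ('kept', 29)]
# The TODO(...) marker is blanked out by `re_ignore` first, so the offsets
# still index into the original text.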
class Comment:
    __slots__ = (
        "file",
        "text",
        "line",
        "type",
    )

    def __init__(self, file: str, text: str, line: int, type: str):
        self.file = file
        self.text = text
        self.line = line
        self.type = type

    def parse(self) -> List[Tuple[str, int]]:
        return words_from_text(self.text)
    def line_and_column_from_comment_offset(self, pos: int) -> Tuple[int, int]:
        text = self.text
        slineno = self.line + text.count("\n", 0, pos)
        # Allow for -1 to be not found.
        scol = text.rfind("\n", 0, pos) + 1
        if scol == 0:
            # Not found.
            scol = pos
        else:
            scol = pos - scol
        return slineno, scol

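# Example of the offset math above (hypothetical values): for a comment whose
# text is "first\nsecond word" starting at `line` 10, offset 13 ("word") comes
# after one "\n", so it maps to line 10 + 1 = 11, column 13 - 6 = 7.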
def extract_code_strings(filepath: str) -> Tuple[List[Comment], Set[str]]:
    import pygments
    from pygments import lexers
    from pygments.token import Token

    comments = []
    code_words = set()

    # lex = lexers.find_lexer_class_for_filename(filepath)
    # if lex is None:
    #     return comments, code_words
    if filepath.endswith(".py"):
        lex = lexers.get_lexer_by_name("python")
    else:
        lex = lexers.get_lexer_by_name("c")

    slineno = 0
    with open(filepath, encoding='utf-8') as fh:
        source = fh.read()

    for ty, ttext in lex.get_tokens(source):
        if ty in {Token.Literal.String, Token.Literal.String.Double, Token.Literal.String.Single}:
            comments.append(Comment(filepath, ttext, slineno, 'STRING'))
        else:
            for match in re_vars.finditer(ttext):
                code_words.add(match.group(0))
        # Ugh - not nice or fast.
        slineno += ttext.count("\n")

    return comments, code_words

def extract_py_comments(filepath: str) -> Tuple[List[Comment], Set[str]]:
    import token
    import tokenize

    source = open(filepath, encoding='utf-8')

    comments = []
    code_words = set()

    prev_toktype = token.INDENT

    tokgen = tokenize.generate_tokens(source.readline)
    for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
        if toktype == token.STRING:
            if prev_toktype == token.INDENT:
                comments.append(Comment(filepath, ttext, slineno, 'DOCSTRING'))
        elif toktype == tokenize.COMMENT:
            # A non-standard hint for commented code that we can ignore.
            if not ttext.startswith("#~"):
                comments.append(Comment(filepath, ttext, slineno, 'COMMENT'))
        else:
            for match in re_vars.finditer(ttext):
                code_words.add(match.group(0))
        prev_toktype = toktype

    source.close()
    return comments, code_words

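# Note on the `#~` convention above (illustrative; `compute_value` is a
# hypothetical identifier): a line commented out as
#   #~ value = compute_value()
# is skipped by the spell checker entirely, being disabled code, not prose.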
def extract_c_comments(filepath: str) -> Tuple[List[Comment], Set[str]]:
    """
    Extracts comments like this:

       /*
        * This is a multi-line comment, notice the '*'s are aligned.
        */
    """
    text = open(filepath, encoding='utf-8').read()

    BEGIN = "/*"
    END = "*/"
    TABSIZE = 4
    SINGLE_LINE = False
    SKIP_COMMENTS = (
        # GPL header.
        "This program is free software; you can",
    )

    # Reverse these to find blocks we won't parse.
    PRINT_NON_ALIGNED = False
    PRINT_SPELLING = True

    comment_ranges = []

    i = 0
    while i != -1:
        i = text.find(BEGIN, i)
        if i != -1:
            i_next = text.find(END, i)
            if i_next != -1:
                # Not essential but seek back to find beginning of line.
                while i > 0 and text[i - 1] in {"\t", " "}:
                    i -= 1
                i_next += len(END)
                comment_ranges.append((i, i_next))
            # When END is not found `i_next` is -1, which also exits the loop.
            i = i_next

    # Collect variables from code, so we can reference variables from code blocks
    # without this generating noise from the spell checker.
    code_ranges = []
    if not comment_ranges:
        code_ranges.append((0, len(text)))
    else:
        for index in range(len(comment_ranges) + 1):
            if index == 0:
                i_prev = 0
            else:
                i_prev = comment_ranges[index - 1][1]

            if index == len(comment_ranges):
                i_next = len(text)
            else:
                i_next = comment_ranges[index][0]

            code_ranges.append((i_prev, i_next))

    code_words = set()
    for i, i_next in code_ranges:
        for match in re_vars.finditer(text[i:i_next]):
            code_words.add(match.group(0))
            # Allow plurals of these variables too.
            code_words.add(match.group(0) + "'s")

    comments = []
    slineno = 0
    i_prev = 0
    for i, i_next in comment_ranges:
        ok = True
        block = text[i:i_next]
        for c in SKIP_COMMENTS:
            if c in block:
                ok = False
                break
        if not ok:
            continue

        # Add white-space in front of the block (for alignment test),
        # allow for -1 being not found, which results as zero.
        j = text.rfind("\n", 0, i) + 1
        block = (" " * (i - j)) + block

        slineno += text.count("\n", i_prev, i)
        comments.append(Comment(filepath, block, slineno, 'COMMENT'))
        i_prev = i

    return comments, code_words

def spell_check_report(filepath: str, report: Report) -> None:
    w, slineno, scol = report
    w_lower = w.lower()

    if ONLY_ONCE:
        if w_lower in _words_visited:
            return
        else:
            _words_visited.add(w_lower)

    suggest = _suggest_map.get(w_lower)
    if suggest is None:
        _suggest_map[w_lower] = suggest = " ".join(dict_spelling.suggest(w))

    print("%s:%d:%d: %s%s%s, suggest (%s)" % (
        filepath,
        slineno + 1,
        scol + 1,
        COLOR_WORD,
        w,
        COLOR_ENDC,
        suggest,
    ))

# -----------------------------------------------------------------------------
# Extract Bad Spelling from a Source File

def spell_check_file(
        filepath: str,
        check_type: str = 'COMMENTS',
) -> Generator[Report, None, None]:
    if check_type == 'COMMENTS':
        if filepath.endswith(".py"):
            comment_list, code_words = extract_py_comments(filepath)
        else:
            comment_list, code_words = extract_c_comments(filepath)
    elif check_type == 'STRINGS':
        comment_list, code_words = extract_code_strings(filepath)

    for comment in comment_list:
        for w, pos in comment.parse():
            w_lower = w.lower()
            if w_lower in dict_custom or w_lower in dict_ignore:
                continue

            if not dict_spelling.check(w):
                # Ignore literals that show up in code,
                # gets rid of a lot of noise from comments that reference variables.
                if w in code_words:
                    # print("Skipping", w)
                    continue

                slineno, scol = comment.line_and_column_from_comment_offset(pos)
                yield (w, slineno, scol)

def spell_check_file_recursive(
        dirpath: str,
        check_type: str = 'COMMENTS',
        cache_data: Optional[CacheData] = None,
) -> None:
    import os
    from os.path import join, splitext

    def source_list(path: str, filename_check: Optional[Callable[[str], bool]] = None) -> Generator[str, None, None]:
        for dirpath, dirnames, filenames in os.walk(path):
            # Skip '.git' and other dot-directories.
            dirnames[:] = [d for d in dirnames if not d.startswith(".")]
            for filename in filenames:
                if filename.startswith("."):
                    continue
                filepath = join(dirpath, filename)
                if filename_check is None or filename_check(filepath):
                    yield filepath

    def is_source(filename: str) -> bool:
        ext = splitext(filename)[1]
        return ext in {
            ".c",
            ".inl",
            ".cpp",
            ".cxx",
            ".hpp",
            ".hxx",
            ".h",
            ".hh",
            ".m",
            ".mm",
            ".osl",
            ".py",
        }

    for filepath in source_list(dirpath, is_source):
        for report in spell_check_file_with_cache_support(filepath, check_type=check_type, cache_data=cache_data):
            spell_check_report(filepath, report)

# -----------------------------------------------------------------------------
# Cache File Support
#
# Cache is formatted as follows:
# (
# # Store all misspelled words.
# {filepath: (size, sha512, [reports, ...])},
#
# # Store suggestions, as these are slow to re-calculate.
# {lowercase_words: suggestions},
# )
#
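# A minimal round-trip sketch (hypothetical path, commented out):
#   cache_data, suggest_map = spell_cache_read("/tmp/spelling.cache")
#   ... run checks, updating both ...
#   spell_cache_write("/tmp/spelling.cache", (cache_data, suggest_map))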
def spell_cache_read(cache_filepath: str) -> Tuple[CacheData, SuggestMap]:
    import pickle
    cache_store: Tuple[CacheData, SuggestMap] = {}, {}
    if os.path.exists(cache_filepath):
        with open(cache_filepath, 'rb') as fh:
            cache_store = pickle.load(fh)
    return cache_store

def spell_cache_write(cache_filepath: str, cache_store: Tuple[CacheData, SuggestMap]) -> None:
    import pickle
    with open(cache_filepath, 'wb') as fh:
        pickle.dump(cache_store, fh)

def spell_check_file_with_cache_support(
        filepath: str,
        check_type: str = 'COMMENTS',
        cache_data: Optional[CacheData] = None,
) -> Generator[Report, None, None]:
"""
Iterator each item is a report: (word, line_number, column_number)
"""
_files_visited.add(filepath)
if cache_data is None:
yield from spell_check_file(filepath, check_type=check_type)
return
cache_data_for_file = cache_data.get(filepath)
if cache_data_for_file and len(cache_data_for_file) != 3:
cache_data_for_file = None
cache_hash_test, cache_len_test = hash_of_file_and_len(filepath)
if cache_data_for_file is not None:
cache_len, cache_hash, cache_reports = cache_data_for_file
if cache_len_test == cache_len:
if cache_hash_test == cache_hash:
if VERBOSE_CACHE:
print("Using cache for:", filepath)
yield from cache_reports
return
cache_reports = []
for report in spell_check_file(filepath, check_type=check_type):
cache_reports.append(report)
cache_data[filepath] = (cache_len_test, cache_hash_test, cache_reports)
yield from cache_reports
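# Illustrative call (hypothetical path): reports stream the same way whether
# they come from the cache or from a fresh scan.
#   for word, line, col in spell_check_file_with_cache_support("foo.c", cache_data={}):
#       print(word, line, col)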
# -----------------------------------------------------------------------------
# Main & Argument Parsing

def argparse_create() -> argparse.ArgumentParser:
    # When --help or no args are given, print this help.
    description = __doc__

    parser = argparse.ArgumentParser(description=description)

    parser.add_argument(
        '--extract',
        dest='extract',
        choices=('COMMENTS', 'STRINGS'),
        default='COMMENTS',
        required=False,
        metavar='WHAT',
        help=(
            'Text to extract for checking.\n'
            '\n'
            '- ``COMMENTS`` extracts comments from source code.\n'
            '- ``STRINGS`` extracts text.'
        ),
    )
    parser.add_argument(
        "--cache-file",
        dest="cache_file",
        help=(
            "Optional cache, for fast re-execution, "
            "avoiding re-extracting spelling when files have not been modified."
        ),
        required=False,
    )
    parser.add_argument(
        "paths",
        nargs='+',
        help="Files or directories to walk recursively.",
    )

    return parser

def main() -> None:
    global _suggest_map
    import os

    args = argparse_create().parse_args()

    check_type = args.extract
    cache_filepath = args.cache_file
    cache_data: Optional[CacheData] = None
    if cache_filepath:
        cache_data, _suggest_map = spell_cache_read(cache_filepath)
    clear_stale_cache = True

    # print(check_type)
    try:
        for filepath in args.paths:
            if os.path.isdir(filepath):
                # Recursive search.
                spell_check_file_recursive(filepath, check_type=check_type, cache_data=cache_data)
            else:
                # Single file.
                for report in spell_check_file_with_cache_support(
                        filepath, check_type=check_type, cache_data=cache_data):
                    spell_check_report(filepath, report)
    except KeyboardInterrupt:
        # A partial run must not drop cache entries for files not yet visited.
        clear_stale_cache = False

    if cache_filepath:
        assert cache_data is not None
        if VERBOSE_CACHE:
            print("Writing cache:", len(cache_data))

        if clear_stale_cache:
            # Don't keep suggestions for old misspellings.
            _suggest_map = {w_lower: _suggest_map[w_lower] for w_lower in _words_visited}

            for filepath in list(cache_data.keys()):
                if filepath not in _files_visited:
                    del cache_data[filepath]

        spell_cache_write(cache_filepath, (cache_data, _suggest_map))

if __name__ == "__main__":
    main()