#!/usr/bin/env python3 # ##### BEGIN GPL LICENSE BLOCK ##### # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software Foundation, # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # # ##### END GPL LICENSE BLOCK ##### # <pep8 compliant> """ Script for checking source code spelling. python3 source/tools/check_source/check_spelling.py some_soure_file.py - Pass in a path for it to be checked recursively. - Pass in '--strings' to check strings instead of comments. Currently only python source is checked. """ import os import argparse from typing import ( Callable, Dict, Generator, List, Optional, Set, Tuple, ) # Report: word, line, column. Report = Tuple[str, int, int] # Cache: {filepath: length, hash, reports}. CacheData = Dict[str, Tuple[int, bytes, List[Report]]] # Map word to suggestions. SuggestMap = Dict[str, str] ONLY_ONCE = True USE_COLOR = True _words_visited = set() _files_visited = set() # Lowercase word -> suggestion list. _suggest_map: SuggestMap = {} VERBOSE_CACHE = False if USE_COLOR: COLOR_WORD = "\033[92m" COLOR_ENDC = "\033[0m" else: COLOR_WORD = "" COLOR_ENDC = "" import enchant dict_spelling = enchant.Dict("en_US") from check_spelling_c_config import ( dict_custom, dict_ignore, ) # ----------------------------------------------------------------------------- # General Utilities def hash_of_file_and_len(fp: str) -> Tuple[bytes, int]: import hashlib with open(fp, 'rb') as fh: data = fh.read() m = hashlib.sha512() m.update(data) return m.digest(), len(data) import re re_vars = re.compile("[A-Za-z]+") # First remove this from comments, so we don't spell check example code, doxygen commands, etc. re_ignore = re.compile( r'(' # URL. r'(https?|ftp)://\S+|' # Email address: <me@email.com> # <someone@foo.bar-baz.com> r"<\w+@[\w\.\-]+>|" # Convention for TODO/FIXME messages: TODO(my name) OR FIXME(name+name) OR XXX(some-name) OR NOTE(name/other-name): r"\b(TODO|FIXME|XXX|NOTE)\([A-Za-z\s\+\-/]+\)|" # Doxygen style: <pre> ... </pre> r"<pre>.+</pre>|" # Doxygen style: \code ... \endcode r"\s+\\code\b.+\s\\endcode\b|" # Doxygen style #SOME_CODE. r'#\S+|' # Doxygen commands: \param foo r"\\(section|subsection|subsubsection|ingroup|param|page|a|see)\s+\S+|" # Doxygen commands without any arguments after them: \command r"\\(retval|todo)\b|" # Doxygen 'param' syntax used rarely: \param foo[in,out] r"\\param\[[a-z,]+\]\S*|" # Words containing underscores: a_b r'\S*\w+_\S+|' # Words containing arrows: a->b r'\S*\w+\->\S+' # Words containing dot notation: a.b (NOT ab... since this is used in English). r'\w+\.\w+\S*|' # Single and back-tick quotes (often used to reference code). r"\s\`[^\n`]+\`|" r"\s'[^\n']+'" r')', re.MULTILINE | re.DOTALL, ) # Then extract words. re_words = re.compile( r"\b(" # Capital words, with optional '-' and "'". r"[A-Z]+[\-'A-Z]*[A-Z]|" # Lowercase words, with optional '-' and "'". r"[A-Za-z][\-'a-z]*[a-z]+" r")\b" ) re_not_newline = re.compile("[^\n]") def words_from_text(text: str) -> List[Tuple[str, int]]: """ Extract words to treat as English for spell checking. """ # Replace non-newlines with white-space, so all alignment is kept. def replace_ignore(match: re.Match[str]) -> str: start, end = match.span() return re_not_newline.sub(" ", match.string[start:end]) # Handy for checking what we ignore, incase we ignore too much and miss real errors. # for match in re_ignore.finditer(text): # print(match.group(0)) # Strip out URL's, code-blocks, etc. text = re_ignore.sub(replace_ignore, text) words = [] for match in re_words.finditer(text): words.append((match.group(0), match.start())) def word_ok(w: str) -> bool: # Ignore all uppercase words. if w.isupper(): return False return True words[:] = [w for w in words if word_ok(w[0])] return words class Comment: __slots__ = ( "file", "text", "line", "type", ) def __init__(self, file: str, text: str, line: int, type: str): self.file = file self.text = text self.line = line self.type = type def parse(self) -> List[Tuple[str, int]]: return words_from_text(self.text) def line_and_column_from_comment_offset(self, pos: int) -> Tuple[int, int]: text = self.text slineno = self.line + text.count("\n", 0, pos) # Allow for -1 to be not found. scol = text.rfind("\n", 0, pos) + 1 if scol == 0: # Not found. scol = pos else: scol = pos - scol return slineno, scol def extract_code_strings(filepath: str) -> Tuple[List[Comment], Set[str]]: import pygments from pygments import lexers from pygments.token import Token comments = [] code_words = set() # lex = lexers.find_lexer_class_for_filename(filepath) # if lex is None: # return comments, code_words if filepath.endswith(".py"): lex = lexers.get_lexer_by_name("python") else: lex = lexers.get_lexer_by_name("c") slineno = 0 with open(filepath, encoding='utf-8') as fh: source = fh.read() for ty, ttext in lex.get_tokens(source): if ty in {Token.Literal.String, Token.Literal.String.Double, Token.Literal.String.Single}: comments.append(Comment(filepath, ttext, slineno, 'STRING')) else: for match in re_vars.finditer(ttext): code_words.add(match.group(0)) # Ugh - not nice or fast. slineno += ttext.count("\n") return comments, code_words def extract_py_comments(filepath: str) -> Tuple[List[Comment], Set[str]]: import token import tokenize source = open(filepath, encoding='utf-8') comments = [] code_words = set() prev_toktype = token.INDENT tokgen = tokenize.generate_tokens(source.readline) for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen: if toktype == token.STRING: if prev_toktype == token.INDENT: comments.append(Comment(filepath, ttext, slineno, 'DOCSTRING')) elif toktype == tokenize.COMMENT: # non standard hint for commented CODE that we can ignore if not ttext.startswith("#~"): comments.append(Comment(filepath, ttext, slineno, 'COMMENT')) else: for match in re_vars.finditer(ttext): code_words.add(match.group(0)) prev_toktype = toktype return comments, code_words def extract_c_comments(filepath: str) -> Tuple[List[Comment], Set[str]]: """ Extracts comments like this: /* * This is a multi-line comment, notice the '*'s are aligned. */ """ text = open(filepath, encoding='utf-8').read() BEGIN = "/*" END = "*/" TABSIZE = 4 SINGLE_LINE = False SKIP_COMMENTS = ( # GPL header. "This program is free software; you can", ) # reverse these to find blocks we won't parse PRINT_NON_ALIGNED = False PRINT_SPELLING = True comment_ranges = [] i = 0 while i != -1: i = text.find(BEGIN, i) if i != -1: i_next = text.find(END, i) if i_next != -1: # Not essential but seek back to find beginning of line. while i > 0 and text[i - 1] in {"\t", " "}: i -= 1 i_next += len(END) comment_ranges.append((i, i_next)) i = i_next else: pass # Collect variables from code, so we can reference variables from code blocks # without this generating noise from the spell checker. code_ranges = [] if not comment_ranges: code_ranges.append((0, len(text))) else: for index in range(len(comment_ranges) + 1): if index == 0: i_prev = 0 else: i_prev = comment_ranges[index - 1][1] if index == len(comment_ranges): i_next = len(text) else: i_next = comment_ranges[index][0] code_ranges.append((i_prev, i_next)) code_words = set() for i, i_next in code_ranges: for match in re_vars.finditer(text[i:i_next]): code_words.add(match.group(0)) # Allow plurals of these variables too. code_words.add(match.group(0) + "'s") comments = [] slineno = 0 i_prev = 0 for i, i_next in comment_ranges: ok = True block = text[i:i_next] for c in SKIP_COMMENTS: if c in block: ok = False break if not ok: continue # Add white-space in front of the block (for alignment test) # allow for -1 being not found, which results as zero. j = text.rfind("\n", 0, i) + 1 block = (" " * (i - j)) + block slineno += text.count("\n", i_prev, i) comments.append(Comment(filepath, block, slineno, 'COMMENT')) i_prev = i return comments, code_words def spell_check_report(filepath: str, report: Report) -> None: w, slineno, scol = report w_lower = w.lower() if ONLY_ONCE: if w_lower in _words_visited: return else: _words_visited.add(w_lower) suggest = _suggest_map.get(w_lower) if suggest is None: _suggest_map[w_lower] = suggest = " ".join(dict_spelling.suggest(w)) print("%s:%d:%d: %s%s%s, suggest (%s)" % ( filepath, slineno + 1, scol + 1, COLOR_WORD, w, COLOR_ENDC, suggest, )) def spell_check_file( filepath: str, check_type: str = 'COMMENTS', ) -> Generator[Report, None, None]: if check_type == 'COMMENTS': if filepath.endswith(".py"): comment_list, code_words = extract_py_comments(filepath) else: comment_list, code_words = extract_c_comments(filepath) elif check_type == 'STRINGS': comment_list, code_words = extract_code_strings(filepath) for comment in comment_list: for w, pos in comment.parse(): w_lower = w.lower() if w_lower in dict_custom or w_lower in dict_ignore: continue if not dict_spelling.check(w): # Ignore literals that show up in code, # gets rid of a lot of noise from comments that reference variables. if w in code_words: # print("Skipping", w) continue slineno, scol = comment.line_and_column_from_comment_offset(pos) yield (w, slineno, scol) def spell_check_file_recursive( dirpath: str, check_type: str = 'COMMENTS', cache_data: Optional[CacheData]=None, ) -> None: import os from os.path import join, splitext def source_list(path: str, filename_check: Optional[Callable[[str], bool]]=None) -> Generator[str, None, None]: for dirpath, dirnames, filenames in os.walk(path): # skip '.git' dirnames[:] = [d for d in dirnames if not d.startswith(".")] for filename in filenames: if filename.startswith("."): continue filepath = join(dirpath, filename) if filename_check is None or filename_check(filepath): yield filepath def is_source(filename: str) -> bool: ext = splitext(filename)[1] return (ext in { ".c", ".cc", ".inl", ".cpp", ".cxx", ".hpp", ".hxx", ".h", ".hh", ".m", ".mm", ".osl", ".py", }) for filepath in source_list(dirpath, is_source): for report in spell_check_file_with_cache_support(filepath, check_type=check_type, cache_data=cache_data): spell_check_report(filepath, report) # ----------------------------------------------------------------------------- # Cache File Support # # Cache is formatted as follows: # ( # # Store all misspelled words. # {filepath: (size, sha512, [reports, ...])}, # # # Store suggestions, as these are slow to re-calculate. # {lowercase_words: suggestions}, # ) # def spell_cache_read(cache_filepath: str) -> Tuple[CacheData, SuggestMap]: import pickle cache_store: Tuple[CacheData, SuggestMap] = {}, {} if os.path.exists(cache_filepath): with open(cache_filepath, 'rb') as fh: cache_store = pickle.load(fh) return cache_store def spell_cache_write(cache_filepath: str, cache_store: Tuple[CacheData, SuggestMap]) -> None: import pickle with open(cache_filepath, 'wb') as fh: pickle.dump(cache_store, fh) def spell_check_file_with_cache_support( filepath: str, check_type: str = 'COMMENTS', cache_data: Optional[CacheData] = None, ) -> Generator[Report, None, None]: """ Iterator each item is a report: (word, line_number, column_number) """ _files_visited.add(filepath) if cache_data is None: yield from spell_check_file(filepath, check_type=check_type) return cache_data_for_file = cache_data.get(filepath) if cache_data_for_file and len(cache_data_for_file) != 3: cache_data_for_file = None cache_hash_test, cache_len_test = hash_of_file_and_len(filepath) if cache_data_for_file is not None: cache_len, cache_hash, cache_reports = cache_data_for_file if cache_len_test == cache_len: if cache_hash_test == cache_hash: if VERBOSE_CACHE: print("Using cache for:", filepath) yield from cache_reports return cache_reports = [] for report in spell_check_file(filepath, check_type=check_type): cache_reports.append(report) cache_data[filepath] = (cache_len_test, cache_hash_test, cache_reports) yield from cache_reports # ----------------------------------------------------------------------------- # Extract Bad Spelling from a Source File # ----------------------------------------------------------------------------- # Main & Argument Parsing def argparse_create() -> argparse.ArgumentParser: # When --help or no args are given, print this help description = __doc__ parser = argparse.ArgumentParser(description=description) parser.add_argument( '--extract', dest='extract', choices=('COMMENTS', 'STRINGS'), default='COMMENTS', required=False, metavar='WHAT', help=( 'Text to extract for checking.\n' '\n' '- ``COMMENTS`` extracts comments from source code.\n' '- ``STRINGS`` extracts text.' ), ) parser.add_argument( "--cache-file", dest="cache_file", help=( "Optional cache, for fast re-execution, " "avoiding re-extracting spelling when files have not been modified." ), required=False, ) parser.add_argument( "paths", nargs='+', help="Files or directories to walk recursively.", ) return parser def main() -> None: global _suggest_map import os args = argparse_create().parse_args() check_type = args.extract cache_filepath = args.cache_file cache_data: Optional[CacheData] = None if cache_filepath: cache_data, _suggest_map = spell_cache_read(cache_filepath) clear_stale_cache = True # print(check_type) try: for filepath in args.paths: if os.path.isdir(filepath): # recursive search spell_check_file_recursive(filepath, check_type=check_type, cache_data=cache_data) else: # single file for report in spell_check_file_with_cache_support( filepath, check_type=check_type, cache_data=cache_data): spell_check_report(filepath, report) except KeyboardInterrupt: clear_stale_cache = False if cache_filepath: assert(cache_data is not None) if VERBOSE_CACHE: print("Writing cache:", len(cache_data)) if clear_stale_cache: # Don't keep suggestions for old misspellings. _suggest_map = {w_lower: _suggest_map[w_lower] for w_lower in _words_visited} for filepath in list(cache_data.keys()): if filepath not in _files_visited: del cache_data[filepath] spell_cache_write(cache_filepath, (cache_data, _suggest_map)) if __name__ == "__main__": main()