    # ##### BEGIN GPL LICENSE BLOCK #####
    #
    #  This program is free software; you can redistribute it and/or
    #  modify it under the terms of the GNU General Public License
    #  as published by the Free Software Foundation; either version 2
    #  of the License, or (at your option) any later version.
    #
    #  This program is distributed in the hope that it will be useful,
    #  but WITHOUT ANY WARRANTY; without even the implied warranty of
    #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    #  GNU General Public License for more details.
    #
    #  You should have received a copy of the GNU General Public License
    #  along with this program; if not, write to the Free Software Foundation,
    #  Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
    #
    # ##### END GPL LICENSE BLOCK #####
    
    # <pep8 compliant>
    
    """
    Script for checking source code spelling.
    
    
       python3 source/tools/check_source/check_spelling.py some_soure_file.py
    
    - Pass in a path for it to be checked recursively.
    - Pass in '--strings' to check strings instead of comments.
    
    
    Currently only python source is checked.
    """
    
    import os
    
    import argparse
    
    from typing import (
        Callable,
        Dict,
        Generator,
        List,
        Optional,
        Set,
        Tuple,
    )
    
    
    # Report: word, line, column.
    Report = Tuple[str, int, int]
    # Cache: {filepath: length, hash, reports}.
    CacheData = Dict[str, Tuple[int, bytes, List[Report]]]
    # Map word to suggestions.
    SuggestMap = Dict[str, str]
    
    
    ONLY_ONCE = True
    USE_COLOR = True
    # Print when the cache is used or written.
    VERBOSE_CACHE = False
    
    
    _words_visited = set()
    _files_visited = set()
    
    # Lowercase word -> suggestion list.
    _suggest_map: SuggestMap = {}
    
    
    if USE_COLOR:
        COLOR_WORD = "\033[92m"
        COLOR_ENDC = "\033[0m"
    else:
        COLOR_WORD = ""
        COLOR_ENDC = ""
    
    
    import enchant
    dict_spelling = enchant.Dict("en_US")
    
    
    from check_spelling_c_config import (
        dict_custom,
        dict_ignore,
    )
    
    # -----------------------------------------------------------------------------
    # General Utilities
    
    
    def hash_of_file_and_len(fp: str) -> Tuple[bytes, int]:
    
        import hashlib
        with open(fp, 'rb') as fh:
            data = fh.read()
    
            m = hashlib.sha512()
            m.update(data)
            return m.digest(), len(data)
    
    import re
    re_vars = re.compile("[A-Za-z]+")
    # Matches any single character that isn't a newline,
    # used to blank-out ignored text while keeping offsets.
    re_not_newline = re.compile("[^\n]")
    
    
    # First remove this from comments, so we don't spell check example code, doxygen commands, etc.
    re_ignore = re.compile(
        # Email address: <me@email.com>
        #                <someone@foo.bar-baz.com>
        r"<\w+@[\w\.\-]+>|"

        # Convention for TODO/FIXME messages: TODO(my name) OR FIXME(name+name) OR XXX(some-name) OR NOTE(name/other-name):
        r"\b(TODO|FIXME|XXX|NOTE)\([A-Za-z\s\+\-/]+\)|"

        # Doxygen style: <pre> ... </pre>
        r"<pre>.+</pre>|"
        # Doxygen style: \code ... \endcode
        r"\s+\\code\b.+\s\\endcode\b|"
        # Doxygen style: #SOME_CODE.
        r'#\S+|'

        # Doxygen commands with a single argument: \param foo
        r"\\(section|subsection|subsubsection|ingroup|param|page|a|see)\s+\S+|"
        # Doxygen commands without any arguments after them: \command
        r"\\(retval|todo)\b|"
        # Doxygen 'param' syntax used rarely: \param foo[in,out]
        r"\\param\[[a-z,]+\]\S*|"

        # Words containing underscores: a_b
        r'\S*\w+_\S+|'
        # Words containing arrows: a->b
        r'\S*\w+\->\S+|'
        # Words containing dot notation: a.b (NOT "ab..." since this is used in English).
        r'\w+\.\w+\S*|'

        # Single and back-tick quotes (often used to reference code).
        r"\s\`[^\n`]+\`|"
        r"\s'[^\n']+'"
    )
    
    # Then extract words.
    re_words = re.compile(
        r"\b("
        # Capital words, with optional '-' and "'".
        r"[A-Z]+[\-'A-Z]*[A-Z]|"
        # Lowercase words, with optional '-' and "'".
        r"[A-Za-z][\-'a-z]*[a-z]+"
        r")\b"
    )
    
    def words_from_text(text: str) -> List[Tuple[str, int]]:
    
        """ Extract words to treat as English for spell checking.
        """
    
        # Replace non-newlines with white-space, so all alignment is kept.
    
        def replace_ignore(match: re.Match[str]) -> str:
    
            start, end = match.span()
            return re_not_newline.sub(" ", match.string[start:end])
    
        # Handy for checking what we ignore, in case we ignore too much and miss real errors.
        # for match in re_ignore.finditer(text):
        #     print(match.group(0))
    
        # Strip out URL's, code-blocks, etc.
        text = re_ignore.sub(replace_ignore, text)
    
        words = []
        for match in re_words.finditer(text):
            words.append((match.group(0), match.start()))
    
        def word_ok(w: str) -> bool:
            # All-uppercase words are typically acronyms or constants, skip them.
            if w.isupper():
                return False
            return True

        words[:] = [w for w in words if word_ok(w[0])]

        return words
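
    # For example (illustrative): given the text
    #   "Compute the size, see foo->bar and TODO(name)."
    # `words_from_text` returns words such as ("Compute", 0) and ("size", 12);
    # "foo->bar" and "TODO(name)" are first blanked out by `re_ignore`
    # (replaced with spaces, so the original offsets are preserved).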
    

    class Comment:
        __slots__ = (
            "file",
            "text",
            "line",
            "type",
        )
    
        def __init__(self, file: str, text: str, line: int, type: str):
    
            self.file = file
            self.text = text
            self.line = line
            self.type = type
    
    
        def parse(self) -> List[Tuple[str, int]]:
    
            return words_from_text(self.text)
    
    
        def line_and_column_from_comment_offset(self, pos: int) -> Tuple[int, int]:
    
            text = self.text
            slineno = self.line + text.count("\n", 0, pos)
            # Allow for -1 to be not found.
            scol = text.rfind("\n", 0, pos) + 1
            if scol == 0:
                # Not found.
                scol = pos
            else:
                scol = pos - scol
            return slineno, scol
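
        # For example (illustrative): with line=10 and text "first\nsecond",
        # offset 8 (the "c" in "second") maps to (11, 2): one newline before
        # the offset advances the line, and the column counts from the
        # character just after that newline.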
    
    
    def extract_code_strings(filepath: str) -> Tuple[List[Comment], Set[str]]:
    
        import pygments
        from pygments import lexers
        from pygments.token import Token
    
        comments = []
        code_words = set()
    
        # lex = lexers.find_lexer_class_for_filename(filepath)
        # if lex is None:
        #     return comments, code_words
        if filepath.endswith(".py"):
            lex = lexers.get_lexer_by_name("python")
        else:
            lex = lexers.get_lexer_by_name("c")
    
    
        slineno = 0
        with open(filepath, encoding='utf-8') as fh:
            source = fh.read()

        for ty, ttext in lex.get_tokens(source):
            if ty in {Token.Literal.String, Token.Literal.String.Double, Token.Literal.String.Single}:
                comments.append(Comment(filepath, ttext, slineno, 'STRING'))
            else:
                for match in re_vars.finditer(ttext):
                    code_words.add(match.group(0))
            # Ugh - not nice or fast.
            slineno += ttext.count("\n")
    
        return comments, code_words
    
    
    
    def extract_py_comments(filepath: str) -> Tuple[List[Comment], Set[str]]:
    
    
        import token
        import tokenize
    
        source = open(filepath, encoding='utf-8')

        comments = []
        code_words: Set[str] = set()

        prev_toktype = token.INDENT
    
        tokgen = tokenize.generate_tokens(source.readline)
        for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
    
            if toktype == token.STRING:
                if prev_toktype == token.INDENT:
                    comments.append(Comment(filepath, ttext, slineno, 'DOCSTRING'))
    
            elif toktype == tokenize.COMMENT:
                # non standard hint for commented CODE that we can ignore
                if not ttext.startswith("#~"):
                    comments.append(Comment(filepath, ttext, slineno, 'COMMENT'))
    
            else:
                for match in re_vars.finditer(ttext):
                    code_words.add(match.group(0))
    
    
            prev_toktype = toktype

        return comments, code_words
    
    def extract_c_comments(filepath: str) -> Tuple[List[Comment], Set[str]]:
    
        """
        Extracts comments like this:
    
            /*
             * This is a multi-line comment, notice the '*'s are aligned.
             */
        """
        text = open(filepath, encoding='utf-8').read()
    
        BEGIN = "/*"
        END = "*/"
        TABSIZE = 4
        SINGLE_LINE = False
        SKIP_COMMENTS = (
            # GPL header.
            "This program is free software; you can",
        )
    
    
        # Reverse these to find blocks we won't parse.
        PRINT_NON_ALIGNED = False
        PRINT_SPELLING = True

        comment_ranges = []

        i = 0
        while i != -1:
            i = text.find(BEGIN, i)
            if i != -1:
                i_next = text.find(END, i)
                if i_next != -1:
                    # Not essential but seek back to find the beginning of the line.
                    while i > 0 and text[i - 1] in {"\t", " "}:
                        i -= 1

                    i_next += len(END)
                    comment_ranges.append((i, i_next))
                i = i_next
    
        # Collect variables from code, so we can reference variables from code blocks
        # without this generating noise from the spell checker.
    
        code_ranges = []
        if not comment_ranges:
            code_ranges.append((0, len(text)))
        else:
            for index in range(len(comment_ranges) + 1):
                if index == 0:
                    i_prev = 0
                else:
                    i_prev = comment_ranges[index - 1][1]
    
                if index == len(comment_ranges):
                    i_next = len(text)
                else:
                    i_next = comment_ranges[index][0]
    
                code_ranges.append((i_prev, i_next))
    
        code_words: Set[str] = set()
        for i, i_next in code_ranges:
            for match in re_vars.finditer(text[i:i_next]):
                code_words.add(match.group(0))
    
                # Allow plurals of these variables too.
                code_words.add(match.group(0) + "'s")
    
        comments = []
        slineno = 0
        i_prev = 0
        for i, i_next in comment_ranges:
            block = text[i:i_next]

            ok = True
            for c in SKIP_COMMENTS:
                if c in block:
                    ok = False
                    break
            if not ok:
                continue

            # Add white-space in front of the block (for alignment tests),
            # allowing for -1 (not found), which results in zero.
            j = text.rfind("\n", 0, i) + 1
            block = (" " * (i - j)) + block

            slineno += text.count("\n", i_prev, i)
            comments.append(Comment(filepath, block, slineno, 'COMMENT'))
            i_prev = i

        return comments, code_words
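
    # Note (illustrative): the white-space prefix keeps block columns aligned
    # with the original file. E.g. for a trailing comment:
    #   "int x;  /* note */"
    # the leading "int x;" is replaced by six spaces, so columns reported for
    # words inside the comment still match the source file.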
    
    def spell_check_report(filepath: str, report: Report) -> None:
    
        w, slineno, scol = report
        w_lower = w.lower()
    
        if ONLY_ONCE:
            if w_lower in _words_visited:
                return
            else:
                _words_visited.add(w_lower)
    
        suggest = _suggest_map.get(w_lower)
        if suggest is None:
            _suggest_map[w_lower] = suggest = " ".join(dict_spelling.suggest(w))
    
        print("%s:%d:%d: %s%s%s, suggest (%s)" % (
            filepath,
            slineno + 1,
            scol + 1,
            COLOR_WORD,
            w,
            COLOR_ENDC,
            suggest,
        ))
    
    def spell_check_file(
            filepath: str,
            check_type: str = 'COMMENTS',
    ) -> Generator[Report, None, None]:
    
        if check_type == 'COMMENTS':
            if filepath.endswith(".py"):
                comment_list, code_words = extract_py_comments(filepath)
            else:
                comment_list, code_words = extract_c_comments(filepath)
        elif check_type == 'STRINGS':
            comment_list, code_words = extract_code_strings(filepath)
        else:
            raise ValueError("unexpected check_type: %r" % check_type)

        for comment in comment_list:
            for w, pos in comment.parse():
                w_lower = w.lower()
                if w_lower in dict_custom or w_lower in dict_ignore:
                    continue

                if not dict_spelling.check(w):
                    # Ignore literals that show up in code,
                    # this gets rid of a lot of noise from comments that reference variables.
                    if w in code_words:
                        # print("Skipping", w)
                        continue

                    slineno, scol = comment.line_and_column_from_comment_offset(pos)
                    yield (w, slineno, scol)
    
    
    
    def spell_check_file_recursive(
            dirpath: str,
            check_type: str = 'COMMENTS',
        cache_data: Optional[CacheData] = None,
    ) -> None:
    
        from os.path import join, splitext
    
    
        def source_list(path: str, filename_check: Optional[Callable[[str], bool]] = None) -> Generator[str, None, None]:
    
            for dirpath, dirnames, filenames in os.walk(path):
    
                # Skip hidden directories such as ".git".
                dirnames[:] = [d for d in dirnames if not d.startswith(".")]
    
                for filename in filenames:
    
                    if filename.startswith("."):
                        continue
    
                    filepath = join(dirpath, filename)
                    if filename_check is None or filename_check(filepath):
                        yield filepath
    
    
        def is_source(filename: str) -> bool:
    
            ext = splitext(filename)[1]
    
            return (ext in {
                ".c",
                ".inl",
                ".cpp",
                ".cxx",
                ".hpp",
                ".hxx",
                ".h",
                ".hh",
                ".m",
                ".mm",
                ".osl",
                ".py",
            })
    
    
        for filepath in source_list(dirpath, is_source):
    
            for report in spell_check_file_with_cache_support(filepath, check_type=check_type, cache_data=cache_data):
                spell_check_report(filepath, report)
    
    # -----------------------------------------------------------------------------
    # Cache File Support
    #
    # Cache is formatted as follows:
    # (
    #     # Store all misspelled words.
    #     {filepath: (size, sha512, [reports, ...])},
    #
    #     # Store suggestions, as these are slow to re-calculate.
    #     {lowercase_words: suggestions},
    # )
    #
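    # For example (illustrative values only):
    # (
    #     {"source/foo.c": (1024, b"\x93...", [("teh", 42, 8)])},
    #     {"teh": "the Te tech"},
    # )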
    
    def spell_cache_read(cache_filepath: str) -> Tuple[CacheData, SuggestMap]:
    
        import pickle

        cache_store: Tuple[CacheData, SuggestMap] = {}, {}

        if os.path.exists(cache_filepath):
            with open(cache_filepath, 'rb') as fh:
                cache_store = pickle.load(fh)
        return cache_store
    
    def spell_cache_write(cache_filepath: str, cache_store: Tuple[CacheData, SuggestMap]) -> None:
    
        import pickle
        with open(cache_filepath, 'wb') as fh:
            pickle.dump(cache_store, fh)
    
    def spell_check_file_with_cache_support(
            filepath: str,
            check_type: str = 'COMMENTS',
            cache_data: Optional[CacheData] = None,
    ) -> Generator[Report, None, None]:
    
        """
        Iterator each item is a report: (word, line_number, column_number)
        """
        _files_visited.add(filepath)
    
        if cache_data is None:
            yield from spell_check_file(filepath, check_type=check_type)
            return
    
        cache_data_for_file = cache_data.get(filepath)
        if cache_data_for_file and len(cache_data_for_file) != 3:
            cache_data_for_file = None
    
        cache_hash_test, cache_len_test = hash_of_file_and_len(filepath)
        if cache_data_for_file is not None:
            cache_len, cache_hash, cache_reports = cache_data_for_file
            if cache_len_test == cache_len:
                if cache_hash_test == cache_hash:
                    if VERBOSE_CACHE:
                        print("Using cache for:", filepath)
                    yield from cache_reports
                    return
    
        cache_reports = []
        for report in spell_check_file(filepath, check_type=check_type):
            cache_reports.append(report)
    
        cache_data[filepath] = (cache_len_test, cache_hash_test, cache_reports)
    
        yield from cache_reports
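
    # For example (illustrative), a cached run over a single file:
    #
    #   cache_data, suggest_map = spell_cache_read("spelling.cache")
    #   for report in spell_check_file_with_cache_support("foo.c", cache_data=cache_data):
    #       spell_check_report("foo.c", report)
    #   spell_cache_write("spelling.cache", (cache_data, suggest_map))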
    
    
    # -----------------------------------------------------------------------------
    # Main & Argument Parsing
    
    
    def argparse_create() -> argparse.ArgumentParser:
    
    
        # When --help or no args are given, print this help
        description = __doc__
        parser = argparse.ArgumentParser(description=description)
    
        parser.add_argument(
            '--extract',
            dest='extract',
            choices=('COMMENTS', 'STRINGS'),
            default='COMMENTS',
            required=False,
            metavar='WHAT',
            help=(
                'Text to extract for checking.\n'
                '\n'
                '- ``COMMENTS`` extracts comments from source code.\n'
            '- ``STRINGS`` extracts quoted text (string literals).'
            ),
        )
    
        parser.add_argument(
            "--cache-file",
            dest="cache_file",
            help=(
                "Optional cache, for fast re-execution, "
                "avoiding re-extracting spelling when files have not been modified."
            ),
            required=False,
        )
    
        parser.add_argument(
            "paths",
            nargs='+',
            help="Files or directories to walk recursively.",
        )
    
        return parser
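
    # Example invocations (illustrative paths):
    #
    #   python3 check_spelling.py source/some_module/
    #   python3 check_spelling.py --extract=STRINGS --cache-file=spelling.cache some_file.c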
    
    
    
    def main() -> None:
    
        global _suggest_map

        args = argparse_create().parse_args()

        check_type = args.extract
        cache_filepath = args.cache_file

        cache_data: Optional[CacheData] = None
        clear_stale_cache = False
        if cache_filepath:
            cache_data, _suggest_map = spell_cache_read(cache_filepath)
            clear_stale_cache = True

        # print(check_type)
        try:
            for filepath in args.paths:
                if os.path.isdir(filepath):
                    # recursive search
                    spell_check_file_recursive(filepath, check_type=check_type, cache_data=cache_data)
                else:
                    # single file
                    for report in spell_check_file_with_cache_support(
                            filepath, check_type=check_type, cache_data=cache_data):
                        spell_check_report(filepath, report)
        except KeyboardInterrupt:
            clear_stale_cache = False
    
        if cache_filepath:
    
            assert cache_data is not None
    
            if VERBOSE_CACHE:
                print("Writing cache:", len(cache_data))
    
            if clear_stale_cache:
                # Don't keep suggestions for old misspellings.
                _suggest_map = {w_lower: _suggest_map[w_lower] for w_lower in _words_visited}
    
                for filepath in list(cache_data.keys()):
                    if filepath not in _files_visited:
                        del cache_data[filepath]
    
            spell_cache_write(cache_filepath, (cache_data, _suggest_map))
    
    
    if __name__ == "__main__":
        main()