Skip to content
Snippets Groups Projects
check_spelling.py 10.3 KiB
Newer Older
  • Learn to ignore specific revisions
  • # ##### BEGIN GPL LICENSE BLOCK #####
    #
    #  This program is free software; you can redistribute it and/or
    #  modify it under the terms of the GNU General Public License
    #  as published by the Free Software Foundation; either version 2
    #  of the License, or (at your option) any later version.
    #
    #  This program is distributed in the hope that it will be useful,
    #  but WITHOUT ANY WARRANTY; without even the implied warranty of
    #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    #  GNU General Public License for more details.
    #
    #  You should have received a copy of the GNU General Public License
    #  along with this program; if not, write to the Free Software Foundation,
    #  Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
    #
    # ##### END GPL LICENSE BLOCK #####
    
    # <pep8 compliant>
    
    """
    Script for checking source code spelling.
    
       python3 source/tools/check_source/check_spelling_c.py some_source_file.py
    
    
    Comments and docstrings of Python files are checked,
    any other file is treated as C-like source (multi-line comments only).
    """
    
    import os
    # Print reports as tab separated task lines (file, line, type, description)
    # whenever the `USE_QTC_TASK` environment variable is set, whatever its value.
    PRINT_QTC_TASKFORMAT = "USE_QTC_TASK" in os.environ

    # Report every misspelled word only the first time it is found.
    ONLY_ONCE = True
    # Wrap the misspelled word in ANSI escape codes when printing.
    USE_COLOR = True
    # Lower-cased words which have been reported already (see ONLY_ONCE).
    _only_once_ids = set()

    # Both names are used unconditionally when a report line is printed,
    # so they have to be defined in either branch.
    if USE_COLOR:
        COLOR_WORD = "\033[92m"
        COLOR_ENDC = "\033[0m"
    else:
        COLOR_WORD = ""
        COLOR_ENDC = ""
    
    
    # Spelling dictionary used to check words and to build suggestions.
    import enchant
    dict_spelling = enchant.Dict("en_US")


    # Project specific word collections, words found in either of them
    # are never reported.
    from check_spelling_c_config import (
        dict_custom,
        dict_ignore,
    )
    
    
    
    def words_from_text(text):
        """ Extract words to treat as English for spell checking.

        The text is split on white-space and on a few separator characters,
        surrounding punctuation is removed from every word and anything that
        looks like an identifier, an option or a piece of code is dropped.

        Returns a list of words, in the order they occur in the text.
        """
        text = text.strip("#'\"")
        # these characters separate words just like white-space does
        for separator in "/-+%,=|":
            text = text.replace(separator, " ")

        # remove punctuation surrounding each word
        words = [w.strip("*?!:;.,'\"`") for w in text.split()]

        def word_ok(w):
            # check for empty string
            if not w:
                return False

            # ignore all uppercase words
            if w.isupper():
                return False

            # check for string with no characters in it
            if not any(c.isalpha() for c in w):
                return False

            # check for prefix/suffix which render this not a real word
            # example '--debug', '\n'
            # TODO, add more
            if w[0] in "%-+\\@":
                return False

            # check for code in comments
            # (the backslash is escaped explicitly, '\&' is not a valid escape)
            if any(c in w for c in "<>{}[]():._0123456789\\&*"):
                return False

            # check for words which contain lower case but have upper case
            # ending chars eg - 'StructRNA', we can ignore these.
            if len(w) > 1:
                has_lower = any(c.islower() for c in w)
                if has_lower and (not w[1:].islower()):
                    return False

            return True

        return [w for w in words if word_ok(w)]
    
    
    class Comment:
        """
        A piece of comment text found in a source file.
        """

        __slots__ = (
            # path of the file the text was found in
            "file",
            # the raw text of the comment
            "text",
            # line number the comment starts on
            "line",
            # kind of text, the extractors use 'COMMENT' or 'DOCSTRING'
            "type",
        )

        def __init__(self, file, text, line, type):
            self.file = file
            self.text = text
            self.line = line
            self.type = type

        def parse(self):
            """ Return the list of words in this comment which should be spell checked.
            """
            return words_from_text(self.text)
    
    
    def extract_py_comments(filepath):
        """
        Collect the comments and doc-strings of a Python source file.

        The file is read as UTF-8 and tokenized, every comment token becomes a
        'COMMENT' and every string token which directly follows an indent token
        (or is the very first token) becomes a 'DOCSTRING'.

        Returns a list of Comment objects in the order they occur in the file.
        """

        import token
        import tokenize

        comments = []

        # start as if an indent was just seen, so a string which is the
        # first token of the file is handled as a doc-string too
        prev_toktype = token.INDENT

        # the tokens are read lazily, consume all of them while the file is open
        with open(filepath, encoding='utf-8') as source:
            tokgen = tokenize.generate_tokens(source.readline)
            for toktype, ttext, (slineno, _scol), _end, _ltext in tokgen:
                if toktype == token.STRING and prev_toktype == token.INDENT:
                    comments.append(Comment(filepath, ttext, slineno, 'DOCSTRING'))
                elif toktype == tokenize.COMMENT:
                    # non standard hint for commented CODE that we can ignore
                    if not ttext.startswith("#~"):
                        comments.append(Comment(filepath, ttext, slineno, 'COMMENT'))
                prev_toktype = toktype
        return comments
    
    
    def extract_c_comments(filepath):
        """
        Collect the multi-line comments of a C-like source file.

        Extracts comments like this:

            /*
             * This is a multi-line comment, notice the '*'s are aligned.
             */

        The file is read as UTF-8. Comments on a single line, comments which
        contain one of the SKIP_COMMENTS markers and comments where the first
        '*' of every line is not in the same column are left out.

        Returns a list of Comment objects of type 'COMMENT',
        in the order they occur in the file.
        """
        with open(filepath, encoding='utf-8') as fh:
            text = fh.read()

        BEGIN = "/*"
        END = "*/"
        TABSIZE = 4
        SINGLE_LINE = False
        STRIP_DOXY = True
        STRIP_DOXY_DIRECTIVES = (
            r"\section",
            r"\subsection",
            r"\subsubsection",
            r"\ingroup",

            r"\param[in]",
            r"\param[out]",
            r"\param[in,out]",
        )

        SKIP_COMMENTS = (
            "BEGIN GPL LICENSE BLOCK",
        )

        # http://doc.qt.nokia.com/qtcreator-2.4/creator-task-lists.html#task-list-file-format
        # file\tline\ttype\tdescription
        # ... > foobar.tasks

        # reverse these to find blocks we won't parse
        PRINT_NON_ALIGNED = False
        PRINT_SPELLING = True

        def strip_doxy_comments(block_split):
            """ Blank out the word following a doxygen directive, in-place.

            At most one directive is handled for each line.
            """
            for index, line in enumerate(block_split):
                for directive in STRIP_DOXY_DIRECTIVES:
                    if directive not in line:
                        continue
                    tokens = line.split()
                    if directive not in tokens:
                        # only part of a longer token, try the next directive
                        continue
                    arg_index = tokens.index(directive) + 1
                    # a directive at the end of the line has nothing to blank out
                    if arg_index < len(tokens):
                        tokens[arg_index] = " "
                        # keep the indentation, the caller slices every line
                        # at the column of the aligned '*'
                        indent = line[:len(line) - len(line.lstrip())]
                        line = indent + " ".join(tokens)
                    break
                block_split[index] = line

        comments = []

        i = 0
        while i >= 0:
            i = text.find(BEGIN, i)
            if i == -1:
                break

            i_next = text.find(END, i)
            if i_next != -1:

                # not essential but seek back to find beginning of line
                while i > 0 and text[i - 1] in {"\t", " "}:
                    i -= 1

                block = text[i:i_next + len(END)]

                # add whitespace in front of the block (for alignment test),
                # anything before the comment on its first line counts as a space
                ws = []
                j = i
                while j > 0 and text[j - 1] != "\n":
                    ws.append("\t" if text[j - 1] == "\t" else " ")
                    j -= 1
                ws.reverse()
                block = "".join(ws) + block

                ok = SINGLE_LINE or ("\n" in block)

                if ok:
                    ok = not any(c in block for c in SKIP_COMMENTS)

                if ok:
                    # expand tabs
                    block_split = [l.expandtabs(TABSIZE) for l in block.split("\n")]

                    # now validate that the block is aligned,
                    # a line without any '*' gives -1 and so breaks the alignment
                    align_vals = tuple(sorted({l.find("*") for l in block_split}))
                    is_aligned = len(align_vals) == 1

                    if is_aligned:
                        if PRINT_SPELLING:
                            if STRIP_DOXY:
                                strip_doxy_comments(block_split)

                            # remove everything up to the aligned '*' of every line
                            # and the comment terminator at the end of the block
                            align = align_vals[0] + 1
                            block = "\n".join([l[align:] for l in block_split])[:-len(END)]

                            # ugh - not nice or fast
                            slineno = 1 + text.count("\n", 0, i)

                            comments.append(Comment(filepath, block, slineno, 'COMMENT'))
                    elif PRINT_NON_ALIGNED:
                        lineno = 1 + text.count("\n", 0, i)
                        if PRINT_QTC_TASKFORMAT:
                            print("%s\t%d\t%s\t%s" % (os.path.abspath(filepath), lineno, "comment", align_vals))
                        else:
                            print(filepath + ":" + str(lineno) + ":")

            # continue the search after this comment,
            # -1 (no terminator found) ends the loop
            i = i_next

        return comments
    
    
    def spell_check_comments(filepath):
        """
        Spell check the comments of a single file and print the misspelled words.

        Files ending with '.py' are handled as Python, any other file is handled
        as C-like source. Words found in `dict_custom` or `dict_ignore`
        (compared lower-cased) are never reported. With ONLY_ONCE enabled a word
        is reported once only, also across multiple calls.
        """

        if filepath.endswith(".py"):
            comment_list = extract_py_comments(filepath)
        else:
            comment_list = extract_c_comments(filepath)

        for comment in comment_list:
            for w in comment.parse():
                w_lower = w.lower()
                if w_lower in dict_custom or w_lower in dict_ignore:
                    continue

                if dict_spelling.check(w):
                    continue

                if ONLY_ONCE:
                    if w_lower in _only_once_ids:
                        continue
                    _only_once_ids.add(w_lower)

                suggestions = " ".join(dict_spelling.suggest(w))

                if PRINT_QTC_TASKFORMAT:
                    print("%s\t%d\t%s\t%s, suggest (%s)" %
                          (comment.file,
                           comment.line,
                           "comment",
                           w,
                           suggestions,
                           ))
                else:
                    print("%s:%d: %s%s%s, suggest (%s)" %
                          (comment.file,
                           comment.line,
                           COLOR_WORD,
                           w,
                           COLOR_ENDC,
                           suggestions,
                           ))
    
    
    def spell_check_comments_recursive(dirpath):
        """
        Spell check the comments of all source files below a directory.

        Files are selected by their extension, directories with a name
        starting with a '.' (such as '.git') are not searched.
        The directory passed in is always searched, whatever its name is.
        """
        from os.path import join, splitext

        source_extensions = {
            ".c",
            ".inl",
            ".cpp",
            ".cxx",
            ".hpp",
            ".hxx",
            ".h",
            ".hh",
            ".m",
            ".mm",
            ".osl",
            ".py",
        }

        def source_list(path, filename_check=None):
            for root, dirnames, filenames in os.walk(path):
                # skip '.git' and other hidden directories,
                # removing them in-place stops `os.walk` from entering them
                dirnames[:] = [d for d in dirnames if not d.startswith(".")]

                for filename in filenames:
                    filepath = join(root, filename)
                    if filename_check is None or filename_check(filepath):
                        yield filepath

        def is_source(filename):
            return splitext(filename)[1] in source_extensions

        for filepath in source_list(dirpath, is_source):
            spell_check_comments(filepath)
    
    
    import sys
    import os
    
    if __name__ == "__main__":
        # every command line argument is either a directory, which is
        # searched recursively, or a single file
        for filepath in sys.argv[1:]:
            if not os.path.isdir(filepath):
                spell_check_comments(filepath)
                continue
            spell_check_comments_recursive(filepath)