Commit adb5af0c authored by Campbell Barton

check_spelling: Scan code for terms & skip reporting these

Comments often refer to structs & variable names,
skip these when reporting spelling errors.
parent aa9cc189
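As a rough illustration of the approach this commit takes (a standalone sketch, not the script itself): collect identifier-like words from the parts of a file that are code, then skip those words when spell checking the comment text. The helper name, the sample text and the known_words stand-in below are hypothetical; the real script feeds the collected code_words into the dict_spelling.check() loop shown in the last hunk.

# Illustrative sketch only: skip words that also appear in the code.
import re

# The same pattern the commit adds as `re_vars`: runs of ASCII letters.
re_vars = re.compile("[A-Za-z]+")


def code_words_from_source(text, comment_ranges):
    """Collect words from everything *outside* the given comment ranges."""
    code_words = set()
    prev_end = 0
    for start, end in sorted(comment_ranges):
        for match in re_vars.finditer(text[prev_end:start]):
            code_words.add(match.group(0))
        prev_end = end
    for match in re_vars.finditer(text[prev_end:]):
        code_words.add(match.group(0))
    return code_words


if __name__ == "__main__":
    # Hypothetical input: a comment that refers to a variable from the code.
    text = 'int frame_len = 0; /* frame_len is clampped later */\n'
    start, end = text.index("/*"), text.index("*/") + 2
    known_words = {"is", "later"}  # stand-in for a real spelling dictionary

    code_words = code_words_from_source(text, [(start, end)])
    for w in re_vars.findall(text[start:end]):
        if w in code_words:
            continue  # "frame" and "len" are named in the code: not typos
        if w.lower() not in known_words:
            print("possible typo:", w)  # reports only "clampped"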
@@ -53,6 +53,9 @@ from check_spelling_c_config import (
 )
 import re
 
+re_vars = re.compile("[A-Za-z]+")
+
+
 def words_from_text(text):
     """ Extract words to treat as English for spell checking.
     """
@@ -144,19 +147,25 @@ def extract_py_comments(filepath):
     source = open(filepath, encoding='utf-8')
 
     comments = []
+    code_words = set()
 
     prev_toktype = token.INDENT
 
     tokgen = tokenize.generate_tokens(source.readline)
     for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
-        if toktype == token.STRING and prev_toktype == token.INDENT:
-            comments.append(Comment(filepath, ttext, slineno, 'DOCSTRING'))
+        if toktype == token.STRING:
+            if prev_toktype == token.INDENT:
+                comments.append(Comment(filepath, ttext, slineno, 'DOCSTRING'))
         elif toktype == tokenize.COMMENT:
             # non standard hint for commented CODE that we can ignore
             if not ttext.startswith("#~"):
                 comments.append(Comment(filepath, ttext, slineno, 'COMMENT'))
+        else:
+            for match in re_vars.finditer(ttext):
+                code_words.add(match.group(0))
         prev_toktype = toktype
-    return comments
+    return comments, code_words
def extract_c_comments(filepath):
@@ -167,7 +176,6 @@ def extract_c_comments(filepath):
          * This is a multi-line comment, notice the '*'s are aligned.
          */
     """
-    i = 0
     text = open(filepath, encoding='utf-8').read()
 
     BEGIN = "/*"
@@ -210,85 +218,115 @@ def extract_c_comments(filepath):
                 break
             block_split[i] = l
-    comments = []
+    comment_ranges = []
-    while i >= 0:
+    i = 0
+    while i != -1:
         i = text.find(BEGIN, i)
         if i != -1:
             i_next = text.find(END, i)
             if i_next != -1:
-                # not essential but seek ack to find beginning of line
+                # Not essential but seek back to find beginning of line.
                 while i > 0 and text[i - 1] in {"\t", " "}:
                     i -= 1
+                i_next += len(END)
+                comment_ranges.append((i, i_next))
+            i = i_next
+        else:
+            pass
-                block = text[i:i_next + len(END)]
+    # Collect variables from code, so we can reference variables from code blocks
+    # without this generating noise from the spell checker.
-                # add whitespace in front of the block (for alignment test)
-                ws = []
-                j = i
-                while j > 0 and text[j - 1] != "\n":
-                    ws.append("\t" if text[j - 1] == "\t" else " ")
-                    j -= 1
-                ws.reverse()
-                block = "".join(ws) + block
+    code_ranges = []
+    if not comment_ranges:
+        code_ranges.append((0, len(text)))
+    else:
+        for index in range(len(comment_ranges) + 1):
+            if index == 0:
+                i_prev = 0
+            else:
+                i_prev = comment_ranges[index - 1][1]
-                ok = True
+            if index == len(comment_ranges):
+                i_next = len(text)
+            else:
+                i_next = comment_ranges[index][0]
-                if not (SINGLE_LINE or ("\n" in block)):
-                    ok = False
+            code_ranges.append((i_prev, i_next))
-                if ok:
-                    for c in SKIP_COMMENTS:
-                        if c in block:
-                            ok = False
-                            break
+    code_words = set()
-                if ok:
-                    # expand tabs
-                    block_split = [l.expandtabs(TABSIZE) for l in block.split("\n")]
+    for i, i_next in code_ranges:
+        for match in re_vars.finditer(text[i:i_next]):
+            code_words.add(match.group(0))
-                    # now validate that the block is aligned
-                    align_vals = tuple(sorted(set([l.find("*") for l in block_split])))
-                    is_aligned = len(align_vals) == 1
+    comments = []
-                    if is_aligned:
-                        if PRINT_SPELLING:
-                            if STRIP_DOXY:
-                                strip_doxy_comments(block_split)
+    for i, i_next in comment_ranges:
+        block = text[i:i_next]
-                            align = align_vals[0] + 1
-                            block = "\n".join([l[align:] for l in block_split])[:-len(END)]
+        # add whitespace in front of the block (for alignment test)
+        ws = []
+        j = i
+        while j > 0 and text[j - 1] != "\n":
+            ws.append("\t" if text[j - 1] == "\t" else " ")
+            j -= 1
+        ws.reverse()
+        block = "".join(ws) + block
-                            # now strip block and get text
-                            # print(block)
+        ok = True
-                            # ugh - not nice or fast
-                            slineno = 1 + text.count("\n", 0, i)
+        if not (SINGLE_LINE or ("\n" in block)):
+            ok = False
-                            comments.append(Comment(filepath, block, slineno, 'COMMENT'))
-                    else:
-                        if PRINT_NON_ALIGNED:
-                            lineno = 1 + text.count("\n", 0, i)
-                            if PRINT_QTC_TASKFORMAT:
-                                filepath = os.path.abspath(filepath)
-                                print("%s\t%d\t%s\t%s" % (filepath, lineno, "comment", align_vals))
-                            else:
-                                print(filepath + ":" + str(lineno) + ":")
+        if ok:
+            for c in SKIP_COMMENTS:
+                if c in block:
+                    ok = False
+                    break
-            i = i_next
-        else:
-            pass
+        if ok:
+            # expand tabs
+            block_split = [l.expandtabs(TABSIZE) for l in block.split("\n")]
+            # now validate that the block is aligned
+            align_vals = tuple(sorted(set([l.find("*") for l in block_split])))
+            is_aligned = len(align_vals) == 1
+            if is_aligned:
+                if PRINT_SPELLING:
+                    if STRIP_DOXY:
+                        strip_doxy_comments(block_split)
-    return comments
+                    align = align_vals[0] + 1
+                    block = "\n".join([l[align:] for l in block_split])[:-len(END)]
+                    # now strip block and get text
+                    # print(block)
+                    # ugh - not nice or fast
+                    slineno = 1 + text.count("\n", 0, i)
+                    comments.append(Comment(filepath, block, slineno, 'COMMENT'))
+            else:
+                if PRINT_NON_ALIGNED:
+                    lineno = 1 + text.count("\n", 0, i)
+                    if PRINT_QTC_TASKFORMAT:
+                        filepath = os.path.abspath(filepath)
+                        print("%s\t%d\t%s\t%s" % (filepath, lineno, "comment", align_vals))
+                    else:
+                        print(filepath + ":" + str(lineno) + ":")
+    return comments, code_words
 
 def spell_check_comments(filepath):
     if filepath.endswith(".py"):
-        comment_list = extract_py_comments(filepath)
+        comment_list, code_words = extract_py_comments(filepath)
     else:
-        comment_list = extract_c_comments(filepath)
+        comment_list, code_words = extract_c_comments(filepath)
     for comment in comment_list:
         for w in comment.parse():
@@ -301,6 +339,12 @@ def spell_check_comments(filepath):
             if not dict_spelling.check(w):
+                # Ignore literals that show up in code,
+                # gets rid of a lot of noise from comments that reference variables.
+                if w in code_words:
+                    # print("Skipping", w)
+                    continue
+
                 if ONLY_ONCE:
                     if w_lower in _only_once_ids:
                         continue