Commit adb5af0c authored by Campbell Barton

check_spelling: Scan code for terms & skip reporting these

Comments often refer to structs & variable names,
skip these when reporting spelling errors.
parent aa9cc189
@@ -53,6 +53,9 @@ from check_spelling_c_config import (
 )
 
+import re
+
+re_vars = re.compile("[A-Za-z]+")
 
 def words_from_text(text):
     """ Extract words to treat as English for spell checking.
     """
@@ -144,19 +147,25 @@ def extract_py_comments(filepath):
     source = open(filepath, encoding='utf-8')
 
     comments = []
+    code_words = set()
 
     prev_toktype = token.INDENT
 
     tokgen = tokenize.generate_tokens(source.readline)
     for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
-        if toktype == token.STRING and prev_toktype == token.INDENT:
-            comments.append(Comment(filepath, ttext, slineno, 'DOCSTRING'))
+        if toktype == token.STRING:
+            if prev_toktype == token.INDENT:
+                comments.append(Comment(filepath, ttext, slineno, 'DOCSTRING'))
         elif toktype == tokenize.COMMENT:
             # non standard hint for commented CODE that we can ignore
             if not ttext.startswith("#~"):
                 comments.append(Comment(filepath, ttext, slineno, 'COMMENT'))
+        else:
+            for match in re_vars.finditer(ttext):
+                code_words.add(match.group(0))
         prev_toktype = toktype
 
-    return comments
+    return comments, code_words
 
 
 def extract_c_comments(filepath):
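
On the Python side the extractor leans on the tokenize module: a STRING token that directly follows an INDENT is recorded as a docstring, COMMENT tokens stay comments, and the text of every other token is now mined for code words. A small self-contained demonstration of how tokenize classifies these tokens (the sample source below is invented):

import io
import token
import tokenize

# Invented sample source, just to show the token classification.
src = 'def area(width, height):\n    """Compute the area."""\n    return width * height  # multiply the dimenssions\n'

for tok in tokenize.generate_tokens(io.StringIO(src).readline):
    if tok.type == token.STRING:
        print("STRING :", tok.string)
    elif tok.type == tokenize.COMMENT:
        print("COMMENT:", tok.string)
# STRING : """Compute the area."""
# COMMENT: # multiply the dimenssions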
@@ -167,7 +176,6 @@ def extract_c_comments(filepath):
          * This is a multi-line comment, notice the '*'s are aligned.
          */
     """
-    i = 0
     text = open(filepath, encoding='utf-8').read()
 
     BEGIN = "/*"
@@ -210,19 +218,53 @@ def extract_c_comments(filepath):
                     break
             block_split[i] = l
 
-    comments = []
+    comment_ranges = []
 
-    while i >= 0:
+    i = 0
+    while i != -1:
         i = text.find(BEGIN, i)
         if i != -1:
             i_next = text.find(END, i)
             if i_next != -1:
-                # not essential but seek ack to find beginning of line
+                # Not essential but seek back to find beginning of line.
                 while i > 0 and text[i - 1] in {"\t", " "}:
                     i -= 1
+                i_next += len(END)
+                comment_ranges.append((i, i_next))
+            i = i_next
+        else:
+            pass
 
-                block = text[i:i_next + len(END)]
+    # Collect variables from code, so we can reference variables from code blocks
+    # without this generating noise from the spell checker.
+    code_ranges = []
+    if not comment_ranges:
+        code_ranges.append((0, len(text)))
+    else:
+        for index in range(len(comment_ranges) + 1):
+            if index == 0:
+                i_prev = 0
+            else:
+                i_prev = comment_ranges[index - 1][1]
+
+            if index == len(comment_ranges):
+                i_next = len(text)
+            else:
+                i_next = comment_ranges[index][0]
+
+            code_ranges.append((i_prev, i_next))
+
+    code_words = set()
+    for i, i_next in code_ranges:
+        for match in re_vars.finditer(text[i:i_next]):
+            code_words.add(match.group(0))
+
+    comments = []
+    for i, i_next in comment_ranges:
+        block = text[i:i_next]
 
         # add whitespace in front of the block (for alignment test)
         ws = []
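
The C side now works in two passes over character offsets: every /* ... */ block becomes a (start, end) pair in comment_ranges, and code_ranges is simply the gaps before, between and after those pairs, which are then scanned for identifier-like words. A toy rerun of that gap logic, using a regex instead of the commit's find() loop to locate the comments (sample text invented):

import re

re_vars = re.compile("[A-Za-z]+")

# Invented sample; the real input is an entire C source file.
text = "int count; /* the count */ float ratio; /* see count */ void init(void);"

# Stand-in for the find()-based loop: one (start, end) pair per comment block.
comment_ranges = [(m.start(), m.end()) for m in re.finditer(r"/\*.*?\*/", text, re.S)]

# The gaps around the comment ranges are the code ranges.
code_ranges = []
prev_end = 0
for start, end in comment_ranges:
    code_ranges.append((prev_end, start))
    prev_end = end
code_ranges.append((prev_end, len(text)))

code_words = set()
for start, end in code_ranges:
    code_words.update(re_vars.findall(text[start:end]))

print(sorted(code_words))
# -> ['count', 'float', 'init', 'int', 'ratio', 'void']
# 'count' is also used in the comments, so it would no longer be reported.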
@@ -276,19 +318,15 @@ def extract_c_comments(filepath):
             else:
                 print(filepath + ":" + str(lineno) + ":")
 
-            i = i_next
-        else:
-            pass
-
-    return comments
+    return comments, code_words
 
 
 def spell_check_comments(filepath):
 
     if filepath.endswith(".py"):
-        comment_list = extract_py_comments(filepath)
+        comment_list, code_words = extract_py_comments(filepath)
     else:
-        comment_list = extract_c_comments(filepath)
+        comment_list, code_words = extract_c_comments(filepath)
 
     for comment in comment_list:
         for w in comment.parse():
@@ -301,6 +339,12 @@ def spell_check_comments(filepath):
             if not dict_spelling.check(w):
+                # Ignore literals that show up in code,
+                # gets rid of a lot of noise from comments that reference variables.
+                if w in code_words:
+                    # print("Skipping", w)
+                    continue
+
                 if ONLY_ONCE:
                     if w_lower in _only_once_ids:
                         continue
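
With this, the new guard in spell_check_comments sits after the dictionary lookup and before the ONLY_ONCE bookkeeping: a word is only reported if the spelling dictionary rejects it and it does not occur anywhere in the file's code. A compact restatement of that ordering with stand-in inputs (the checker and word lists below are hypothetical):

def unknown_words(words, code_words, dict_check):
    # Mirrors the order of the guards: dictionary first, then the code-word skip.
    for w in words:
        if dict_check(w):
            continue  # correctly spelled
        if w in code_words:
            continue  # the comment is naming something that exists in the code
        yield w       # falls through to the ONLY_ONCE / reporting logic

# Hypothetical inputs: pretend only these are real English words.
known = {"use", "the", "pointer", "when", "freeing"}
comment_words = ["use", "vptr", "when", "freing", "the", "pointer"]
print(list(unknown_words(comment_words, {"vptr"}, lambda w: w.lower() in known)))
# -> ['freing']   ('vptr' is skipped because it also appears in the code)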