diff --git a/check_source/check_spelling.py b/check_source/check_spelling.py
index e4cc495a42dc527bdf1148af57ee3f3e1a7144ad..3e3db6c63d1e80bd7bd195d48fec25b8b21991c5 100755
--- a/check_source/check_spelling.py
+++ b/check_source/check_spelling.py
@@ -53,6 +53,9 @@ from check_spelling_c_config import (
 )
 
 
+import re
+re_vars = re.compile("[A-Za-z]+")
+
 def words_from_text(text):
     """ Extract words to treat as English for spell checking. """
 
@@ -144,19 +147,25 @@ def extract_py_comments(filepath):
     source = open(filepath, encoding='utf-8')
 
     comments = []
+    code_words = set()
 
     prev_toktype = token.INDENT
 
     tokgen = tokenize.generate_tokens(source.readline)
     for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
-        if toktype == token.STRING and prev_toktype == token.INDENT:
-            comments.append(Comment(filepath, ttext, slineno, 'DOCSTRING'))
+        if toktype == token.STRING:
+            if prev_toktype == token.INDENT:
+                comments.append(Comment(filepath, ttext, slineno, 'DOCSTRING'))
         elif toktype == tokenize.COMMENT:
             # non standard hint for commented CODE that we can ignore
            if not ttext.startswith("#~"):
                comments.append(Comment(filepath, ttext, slineno, 'COMMENT'))
+        else:
+            for match in re_vars.finditer(ttext):
+                code_words.add(match.group(0))
+
         prev_toktype = toktype
-    return comments
+    return comments, code_words
 
 
 def extract_c_comments(filepath):
@@ -167,7 +176,6 @@ def extract_c_comments(filepath):
        * This is a multi-line comment, notice the '*'s are aligned.
        */
     """
-    i = 0
     text = open(filepath, encoding='utf-8').read()
 
     BEGIN = "/*"
@@ -210,85 +218,115 @@ def extract_c_comments(filepath):
                break
            block_split[i] = l
 
-    comments = []
+    comment_ranges = []
 
-    while i >= 0:
+    i = 0
+    while i != -1:
         i = text.find(BEGIN, i)
         if i != -1:
             i_next = text.find(END, i)
             if i_next != -1:
-
-                # not essential but seek ack to find beginning of line
+                # Not essential but seek back to find beginning of line.
                 while i > 0 and text[i - 1] in {"\t", " "}:
                     i -= 1
+                i_next += len(END)
+                comment_ranges.append((i, i_next))
+            i = i_next
+        else:
+            pass
 
-                block = text[i:i_next + len(END)]
+    # Collect variables from code, so we can reference variables from code blocks
+    # without this generating noise from the spell checker.
 
-                # add whitespace in front of the block (for alignment test)
-                ws = []
-                j = i
-                while j > 0 and text[j - 1] != "\n":
-                    ws .append("\t" if text[j - 1] == "\t" else " ")
-                    j -= 1
-                ws.reverse()
-                block = "".join(ws) + block
+    code_ranges = []
+    if not comment_ranges:
+        code_ranges.append((0, len(text)))
+    else:
+        for index in range(len(comment_ranges) + 1):
+            if index == 0:
+                i_prev = 0
+            else:
+                i_prev = comment_ranges[index - 1][1]
 
-                ok = True
+            if index == len(comment_ranges):
+                i_next = len(text)
+            else:
+                i_next = comment_ranges[index][0]
 
-                if not (SINGLE_LINE or ("\n" in block)):
-                    ok = False
+            code_ranges.append((i_prev, i_next))
 
-                if ok:
-                    for c in SKIP_COMMENTS:
-                        if c in block:
-                            ok = False
-                            break
+    code_words = set()
 
-                if ok:
-                    # expand tabs
-                    block_split = [l.expandtabs(TABSIZE) for l in block.split("\n")]
+    for i, i_next in code_ranges:
+        for match in re_vars.finditer(text[i:i_next]):
+            code_words.add(match.group(0))
 
-                    # now validate that the block is aligned
-                    align_vals = tuple(sorted(set([l.find("*") for l in block_split])))
-                    is_aligned = len(align_vals) == 1
+    comments = []
 
-                    if is_aligned:
-                        if PRINT_SPELLING:
-                            if STRIP_DOXY:
-                                strip_doxy_comments(block_split)
+    for i, i_next in comment_ranges:
+        block = text[i:i_next]
 
-                            align = align_vals[0] + 1
-                            block = "\n".join([l[align:] for l in block_split])[:-len(END)]
+        # add whitespace in front of the block (for alignment test)
+        ws = []
+        j = i
+        while j > 0 and text[j - 1] != "\n":
+            ws .append("\t" if text[j - 1] == "\t" else " ")
+            j -= 1
+        ws.reverse()
+        block = "".join(ws) + block
 
-                            # now strip block and get text
-                            # print(block)
+        ok = True
 
-                            # ugh - not nice or fast
-                            slineno = 1 + text.count("\n", 0, i)
+        if not (SINGLE_LINE or ("\n" in block)):
+            ok = False
 
-                            comments.append(Comment(filepath, block, slineno, 'COMMENT'))
-                    else:
-                        if PRINT_NON_ALIGNED:
-                            lineno = 1 + text.count("\n", 0, i)
-                            if PRINT_QTC_TASKFORMAT:
-                                filepath = os.path.abspath(filepath)
-                                print("%s\t%d\t%s\t%s" % (filepath, lineno, "comment", align_vals))
-                            else:
-                                print(filepath + ":" + str(lineno) + ":")
+        if ok:
+            for c in SKIP_COMMENTS:
+                if c in block:
+                    ok = False
+                    break
 
-                i = i_next
-            else:
-                pass
+        if ok:
+            # expand tabs
+            block_split = [l.expandtabs(TABSIZE) for l in block.split("\n")]
+
+            # now validate that the block is aligned
+            align_vals = tuple(sorted(set([l.find("*") for l in block_split])))
+            is_aligned = len(align_vals) == 1
+
+            if is_aligned:
+                if PRINT_SPELLING:
+                    if STRIP_DOXY:
+                        strip_doxy_comments(block_split)
 
-    return comments
+                    align = align_vals[0] + 1
+                    block = "\n".join([l[align:] for l in block_split])[:-len(END)]
+
+                    # now strip block and get text
+                    # print(block)
+
+                    # ugh - not nice or fast
+                    slineno = 1 + text.count("\n", 0, i)
+
+                    comments.append(Comment(filepath, block, slineno, 'COMMENT'))
+            else:
+                if PRINT_NON_ALIGNED:
+                    lineno = 1 + text.count("\n", 0, i)
+                    if PRINT_QTC_TASKFORMAT:
+                        filepath = os.path.abspath(filepath)
+                        print("%s\t%d\t%s\t%s" % (filepath, lineno, "comment", align_vals))
+                    else:
+                        print(filepath + ":" + str(lineno) + ":")
+
+    return comments, code_words
 
 
 def spell_check_comments(filepath):
 
     if filepath.endswith(".py"):
-        comment_list = extract_py_comments(filepath)
+        comment_list, code_words = extract_py_comments(filepath)
     else:
-        comment_list = extract_c_comments(filepath)
+        comment_list, code_words = extract_c_comments(filepath)
 
     for comment in comment_list:
         for w in comment.parse():
@@ -301,6 +339,12 @@
 
             if not dict_spelling.check(w):
 
+                # Ignore literals that show up in code,
+                # gets rid of a lot of noise from comments that reference variables.
+                if w in code_words:
+                    # print("Skipping", w)
+                    continue
+
                 if ONLY_ONCE:
                     if w_lower in _only_once_ids:
                         continue
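
Note (not part of the patch): the change works by treating every alphabetic run found in source code as a known word and skipping it when spell checking comments. Below is a minimal standalone sketch of that idea; `collect_code_words`, `check_word`, and the stand-in `dict_check` callable are illustrative names, not functions from check_spelling.py.

    import re

    # Same pattern the patch compiles as `re_vars`.
    re_vars = re.compile("[A-Za-z]+")


    def collect_code_words(code_text):
        # Every alphabetic run in the code (identifiers, keywords, literals) counts as a known word.
        return set(re_vars.findall(code_text))


    def check_word(w, code_words, dict_check):
        # Words that also appear in the code are skipped; everything else goes to the dictionary.
        return w in code_words or dict_check(w)


    code_words = collect_code_words("slineno = 1 + text.count(chr(10), 0, i)")
    print(check_word("slineno", code_words, lambda w: False))   # True: appears in code, not reported
    print(check_word("mispeled", code_words, lambda w: False))  # False: would be reported

The regex deliberately over-collects (any run of letters), trading the chance of missing a misspelling that happens to match an identifier for much less noise from comments that reference variable names.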