check_spelling: various improvements

- Use the custom dictionary when checking hyphenated words.
- Add the custom dictionary to the existing dictionary so suggestions will include words from the custom dictionary.
- Refactor spell checking functions into wrapper calls.

Removes over 200 false positives.

check_spelling: various improvements
937a66e7 · Campbell Barton · f4a13d25 · 937a66e7 · 937a66e7
Commit 937a66e7 authored Oct 6, 2021 by Campbell Barton
--- a/check_source/check_spelling.py
+++ b/check_source/check_spelling.py
@@ -70,15 +70,59 @@ else:
    COLOR_WORD = ""
    COLOR_ENDC = ""

-
-import enchant
-dict_spelling = enchant.Dict("en_US")
-
 from check_spelling_c_config import (
    dict_custom,
    dict_ignore,
+    dict_ignore_hyphenated_prefix,
 )

+# -----------------------------------------------------------------------------
+# Dictionary Utilities
+
+def dictionary_create():  # type: ignore
+    """
+    Create the ``en_US`` enchant dictionary, extended with ``dict_custom``.
+
+    :return: the dictionary object (an ``enchant.Dict``).
+    """
+    import enchant  # type: ignore
+    dict_spelling = enchant.Dict("en_US")
+
+    # Words from `dict_ignore` are intentionally not added here,
+    # since added words may be offered as suggestions.
+    for w in dict_custom:
+        # Don't use `add(w)`: it writes to the user's personal word list,
+        # which persists after this script exits.
+        # NOTE(review): confirm session words are still offered by `suggest()`.
+        dict_spelling.add_to_session(w)
+    return dict_spelling
+
+
+def dictionary_check(w: str) -> bool:
+    """
+    Return True when ``w`` is accepted as correctly spelled.
+
+    A word is accepted when its lower-case form is in ``dict_ignore``,
+    when the dictionary accepts it as a whole, or - for hyphenated words -
+    when every non-empty part is ignored or accepted by the dictionary.
+    """
+    if w.lower() in dict_ignore:
+        return True
+
+    if _dict.check(w):
+        return True
+
+    # Only hyphenated words get a second chance, checked part by part.
+    if "-" not in w:
+        return False
+
+    parts = w.strip("-").split("-")
+
+    # Allow: `un-word`, `re-word`.
+    if parts and parts[0].lower() in dict_ignore_hyphenated_prefix:
+        parts = parts[1:]
+
+    for part in parts:
+        # Empty parts come from doubled hyphens, nothing to check.
+        if not part:
+            continue
+        if part.lower() in dict_ignore:
+            continue
+        if not _dict.check(part):
+            return False
+    return True
+
+
+def dictionary_suggest(w: str) -> List[str]:
+    """Return the suggestions the shared dictionary gives for ``w``."""
+    suggestions: List[str] = _dict.suggest(w)  # type: ignore
+    return suggestions
+
+
+# Module-level dictionary, read by `dictionary_check` & `dictionary_suggest`.
+_dict = dictionary_create()  # type: ignore
+

 # -----------------------------------------------------------------------------
 # General Utilities
@@ -207,9 +251,9 @@ class Comment:


 def extract_code_strings(filepath: str) -> Tuple[List[Comment], Set[str]]:
-    import pygments
+    import pygments  # type: ignore
    from pygments import lexers
-    from pygments.token import Token
+    from pygments.token import Token  # type: ignore

    comments = []
    code_words = set()
@@ -375,7 +419,7 @@ def spell_check_report(filepath: str, report: Report) -> None:

    suggest = _suggest_map.get(w_lower)
    if suggest is None:
-        _suggest_map[w_lower] = suggest = " ".join(dict_spelling.suggest(w))
+        _suggest_map[w_lower] = suggest = " ".join(dictionary_suggest(w))

    print("%s:%d:%d: %s%s%s, suggest (%s)" % (
        filepath,
@@ -403,19 +447,10 @@ def spell_check_file(
    for comment in comment_list:
        for w, pos in comment.parse():
            w_lower = w.lower()
-            if w_lower in dict_custom or w_lower in dict_ignore:
+            if w_lower in dict_ignore:
                continue

-            is_good_spelling = dict_spelling.check(w)
-            if not is_good_spelling:
-                if "-" in w:
-                    is_good_spelling = True
-                    for w_sub in w.split("-"):
-                        if w_sub:
-                            if not dict_spelling.check(w_sub):
-                                is_good_spelling = False
-                                break
-
+            is_good_spelling = dictionary_check(w)
            if not is_good_spelling:
                # Ignore literals that show up in code,
                # gets rid of a lot of noise from comments that reference variables.

--- a/check_source/check_spelling_c_config.py
+++ b/check_source/check_spelling_c_config.py
@@ -111,7 +111,7 @@ dict_custom = {
    "parameterization",
    "parentless",
    "passepartout",
-    "pixelated", "pixelisation",
+    "pixelate", "pixelated", "pixelisation",
    "planarity",
    "polytope",
    "postprocessed",
@@ -422,3 +422,14 @@ dict_ignore = {
    "bugprone-suspicious-enum-usage",
    "bugprone-use-after-move",
 }
+
+# Allow: `un-word`, `re-word` ... etc, in this case only check `word`.
+# Entries are lower-case, they are compared against the lower-cased
+# first part of a hyphenated word.
+dict_ignore_hyphenated_prefix = {
+    "de",
+    "mis",
+    "non",
+    "post",
+    "pre",
+    "re",
+    "un",
+}