dotfiles/.config/sublime-text-3/Packages/mdpopups/tests/spellcheck.py

267 lines
7.6 KiB
Python
Raw Normal View History

2018-05-16 00:30:13 +02:00
"""Spell check with aspell."""
from __future__ import unicode_literals
import subprocess
import os
import sys
import codecs
import bs4
import yaml
import re
from collections import namedtuple
# True when running on Python 3 or newer. `console` consults this flag to
# decide whether captured subprocess output has to be decoded from UTF-8.
PY3 = sys.version_info >= (3, 0)
def yaml_load(source, loader=yaml.Loader):
    """
    Parse YAML from `source` using a private subclass of `loader`.

    A string constructor is registered on the subclass so that every YAML
    string scalar is built via `construct_scalar`; see
    http://stackoverflow.com/a/2967461/3609487.  Registering it on a subclass
    keeps the loader class that was passed in unmodified.

    Returns whatever `yaml.load` produces for `source`.
    """

    # NOTE(review): the default `yaml.Loader` is PyYAML's full loader; only
    # feed it input you trust.
    class Loader(loader):
        """Private loader so the constructor below is not added to `loader` itself."""

    def _scalar_as_text(self, node):
        """Build a YAML string node through the plain scalar constructor."""
        return self.construct_scalar(node)

    # Route the standard YAML string tag through our constructor.
    Loader.add_constructor('tag:yaml.org,2002:str', _scalar_as_text)

    return yaml.load(source, Loader=Loader)
def read_config(file_name):
    """
    Read the YAML configuration stored in `file_name`.

    The file is opened as UTF-8 text and parsed with `yaml_load`.

    Returns the parsed configuration.  An empty document (which the YAML
    parser reports as `None`) is returned as an empty dictionary so callers
    can use `.get()` on the result without a prior `None` check.
    """
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        config = yaml_load(f.read())

    # An empty file parses to None; normalize that to "no settings".
    return {} if config is None else config
def console(cmd, input_file=None, input_text=None):
    """
    Run an external command and return its output.

    `cmd` is the argument list passed to `subprocess.Popen` (no shell is
    involved).  If `input_file` is given, its raw bytes are sent to the
    command's standard input; if `input_text` (bytes) is given, it is sent
    as well, after the file's content.  Standard error is merged into
    standard output.

    Returns the captured output: decoded from UTF-8 on Python 3, the raw
    byte string otherwise.

    Raises `AssertionError` if the command exits with a non-zero status; the
    message carries the command's output.
    """
    startupinfo = None
    if sys.platform.startswith('win'):
        # Make Windows honor STARTUPINFO's show-window field for the child.
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW

    # `startupinfo=None` is Popen's default, so one call serves all platforms.
    process = subprocess.Popen(
        cmd,
        startupinfo=startupinfo,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        stdin=subprocess.PIPE,
        shell=False
    )

    # Collect everything destined for stdin first and hand it to
    # `communicate`.  Writing to `process.stdin` directly can deadlock when
    # the child blocks on a full stdout pipe while we are still writing.
    payload = None
    if input_file is not None:
        with open(input_file, 'rb') as f:
            payload = f.read()
    if input_text is not None:
        payload = input_text if payload is None else payload + input_text

    output = process.communicate(payload)[0]

    # Raised explicitly (rather than via `assert`) so the check survives
    # running Python with -O, while keeping the same exception type.
    if process.returncode != 0:
        raise AssertionError(
            "Runtime Error: %s" % (
                output.rstrip().decode('utf-8') if PY3 else output
            )
        )

    return output.decode('utf-8') if PY3 else output
class IgnoreRule(namedtuple('IgnoreRule', 'tag id classes')):
    """
    One parsed ignore selector.

    Holds the three parts a selector is split into: `tag`, `id`, and
    `classes`.
    """
class Spelling(object):
    """
    Spell check HTML documents with aspell.

    Settings are read from a YAML configuration file using these keys:
    `docs` (HTML files or directories to scan), `dictionary` (extra accepted
    words), `attributes` (tag attributes whose values are checked too), and
    `ignores` (selectors of tags to leave out).
    """

    # File name of the compiled user dictionary, resolved against the
    # current working directory.
    DICTIONARY = 'dictionary.bin'
    # Matches one selector token: a bare tag name, `#id`, or `.class`.
    RE_SELECTOR = re.compile(r'(\#|\.)?[-\w]+')

    def __init__(self, config_file):
        """Load settings from the YAML file `config_file`."""

        # Tolerate an empty configuration file.
        config = read_config(config_file) or {}
        self.docs = config.get('docs', [])
        # aspell receives the word list on stdin, one word per line, as bytes.
        self.dictionary = ('\n'.join(config.get('dictionary', []))).encode('utf-8')
        self.attributes = set(config.get('attributes', []))
        self.ignores = self.ignore_rules(*config.get('ignores', []))
        self.dict_bin = os.path.abspath(self.DICTIONARY)

    def ignore_rules(self, *args):
        """
        Process ignore rules.

        Each argument is a selector string such as `div#main.note.warn`.  It
        is lowercased and split into at most one tag name, at most one id,
        and any number of classes.

        Returns a list of `IgnoreRule` objects; selectors that yield no
        tokens are dropped.

        Raises `ValueError` if a selector names more than one tag or more
        than one id.
        """

        ignores = []

        for arg in args:
            tag = None
            tag_id = None
            classes = set()

            for m in self.RE_SELECTOR.finditer(arg.lower()):
                token = m.group(0)
                if token.startswith('.'):
                    classes.add(token[1:])
                elif token.startswith('#'):
                    # A second id must be rejected here; otherwise it would
                    # fall through and be stored as the tag name, yielding a
                    # rule that can never match.
                    if tag_id is not None:
                        raise ValueError('Bad selector!')
                    tag_id = token[1:]
                elif tag is None:
                    tag = token
                else:
                    raise ValueError('Bad selector!')

            if tag or tag_id or classes:
                ignores.append(IgnoreRule(tag, tag_id, tuple(classes)))

        return ignores

    def compile_dictionaries(self):
        """
        Compile user dictionary.

        Removes any existing compiled dictionary, then runs
        `aspell ... create master` with the configured words on stdin and
        prints aspell's output.
        """

        if os.path.exists(self.dict_bin):
            os.remove(self.dict_bin)

        print("Compiling Dictionary...")
        print(
            console(
                [
                    'aspell',
                    '--lang=en',
                    '--encoding=utf-8',
                    'create',
                    'master',
                    # Already absolute; see `__init__`.
                    self.dict_bin
                ],
                input_text=self.dictionary
            )
        )

    def skip_tag(self, el):
        """
        Determine if tag should be skipped.

        `el` must expose `name` and `attrs` the way a Beautiful Soup tag
        does.  A rule matches when every part it specifies (tag name, id,
        classes) agrees with `el`, compared case-insensitively; all of the
        rule's classes must be present on the element.

        Returns `True` if any ignore rule matches, otherwise `False`.
        """

        for rule in self.ignores:
            if rule.tag and el.name.lower() != rule.tag:
                continue
            if rule.id and rule.id != el.attrs.get('id', '').lower():
                continue
            if rule.classes:
                current_classes = set(c.lower() for c in el.attrs.get('class', []))
                if not current_classes.issuperset(rule.classes):
                    continue
            return True

        return False

    def html_to_text(self, tree, root=True):
        """
        Parse the HTML creating a buffer with each tags content.

        Skip any selectors specified and include attributes if specified.
        Ignored tags will not have their attributes scanned either, and
        nothing beneath an ignored tag is collected.

        Returns a single space-joined string when `root` is true, otherwise
        the list of collected text pieces (used by the recursive calls).
        """

        text = []

        if not self.skip_tag(tree):
            for attr in self.attributes:
                value = tree.attrs.get(attr)
                if not value:
                    continue
                # Multi-valued attributes (e.g. `class`) arrive as a list;
                # flatten them so the final join only ever sees strings.
                if isinstance(value, (list, tuple)):
                    value = ' '.join(value)
                text.append(value)

            for child in tree:
                if isinstance(child, bs4.element.Tag):
                    # Recurse even into tags without contents (such as
                    # `<img alt="...">`) so their attributes are checked.
                    text.extend(self.html_to_text(child, False))
                else:
                    text.append(str(child))

        return ' '.join(text) if root else text

    def check_spelling(self, html_file):
        """
        Check spelling.

        Parses `html_file` with html5lib, extracts its text, pipes the text
        through `aspell list` with the compiled user dictionary, and prints
        every word aspell reports.

        Returns `True` if any word was reported, otherwise `False`.
        """

        with codecs.open(html_file, 'r', encoding='utf-8') as file_obj:
            html = bs4.BeautifulSoup(file_obj.read(), "html5lib")
        text = self.html_to_text(html.html)

        wordlist = console(
            [
                'aspell',
                'list',
                '--lang=en',
                '--mode=url',
                '--encoding=utf-8',
                '--extra-dicts',
                self.dict_bin
            ],
            input_text=text.encode('utf-8')
        )

        # `splitlines` copes with both `\n` and `\r\n` line endings, so no
        # stray carriage return is left on the reported words.
        words = sorted(set(w for w in wordlist.splitlines() if w))

        if not words:
            return False

        print('Misspelled words in %s' % html_file)
        print('-' * 80)
        for word in words:
            print(word)
        print('-' * 80)
        print('\n')
        return True

    def check(self):
        """
        Walk documents and initiate spell check.

        Compiles the user dictionary, then checks every configured entry:
        directories are walked recursively for `.html` files, and entries
        that themselves end in `.html` are checked directly.  Anything else
        is ignored.

        Returns `True` if any file contained misspelled words.
        """

        self.compile_dictionaries()

        print('Spell Checking...')

        fail = False

        for doc in self.docs:
            if os.path.isdir(doc):
                for base, _dirs, files in os.walk(doc):
                    for f in files:
                        if f.lower().endswith('.html'):
                            if self.check_spelling(os.path.join(base, f)):
                                fail = True
            elif doc.lower().endswith('.html'):
                if self.check_spelling(doc):
                    fail = True

        return fail
def main():
    """
    Spell check the documents configured in `.spelling.yml`.

    Returns the result of `Spelling.check`: true when misspellings were found.
    """
    return Spelling('.spelling.yml').check()
if __name__ == "__main__":
    # The value returned by `main` becomes the argument to `sys.exit`.
    exit_status = main()
    sys.exit(exit_status)