267 lines
7.6 KiB
Python
267 lines
7.6 KiB
Python
|
"""Spell check with aspell."""
|
||
|
from __future__ import unicode_literals
|
||
|
import subprocess
|
||
|
import os
|
||
|
import sys
|
||
|
import codecs
|
||
|
import bs4
|
||
|
import yaml
|
||
|
import re
|
||
|
from collections import namedtuple
|
||
|
|
||
|
# True when the running interpreter is Python 3 or newer.
PY3 = sys.version_info[0] >= 3
|
||
|
|
||
|
|
||
|
def yaml_load(source, loader=yaml.Loader):
    """
    Parse YAML `source` through a throwaway subclass of `loader`.

    The subclass gets its own constructor for the YAML string tag so that
    every string comes back as unicode (http://stackoverflow.com/a/2967461/3609487),
    while `loader` itself is left untouched.

    NOTE(review): the default `yaml.Loader` is not the safe loader; only feed
    this function configuration you trust.
    """

    class Loader(loader):
        """Private loader subclass; constructors are registered here, not on `loader`."""

    def construct_yaml_str(self, node):
        """Build a YAML string node from its scalar value so it stays unicode."""

        return self.construct_scalar(node)

    # Route the standard string tag through the unicode constructor above.
    Loader.add_constructor('tag:yaml.org,2002:str', construct_yaml_str)

    return yaml.load(source, Loader=Loader)
|
||
|
|
||
|
|
||
|
def read_config(file_name):
    """
    Read the YAML configuration stored in `file_name`.

    The file is decoded as UTF-8 and parsed with `yaml_load`. An empty file
    parses to `None`; an empty dict is returned in that case so callers that
    expect a mapping keep working. Any other parsed value is returned as is.
    """

    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        config = yaml_load(f.read())

    return {} if config is None else config
|
||
|
|
||
|
|
||
|
def console(cmd, input_file=None, input_text=None):
    """
    Run `cmd` and return everything it printed.

    `cmd` is an argument list passed to `subprocess.Popen` without a shell;
    stderr is merged into stdout. The child's stdin receives the bytes of
    `input_file` (a path, read in binary mode) followed by `input_text`
    (bytes), when given.

    Returns the output decoded as UTF-8 on Python 3 and the raw byte string on
    Python 2. Raises `AssertionError` carrying the output when the command
    exits with a non-zero status.
    """

    popen_kwargs = {
        'stdout': subprocess.PIPE,
        'stderr': subprocess.STDOUT,
        'stdin': subprocess.PIPE,
        'shell': False
    }
    if sys.platform.startswith('win'):
        # NOTE(review): STARTF_USESHOWWINDOW with the default show-window
        # value is presumably meant to keep a console window from popping up.
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        popen_kwargs['startupinfo'] = startupinfo

    # Gather the input before spawning so a bad `input_file` cannot leave a
    # child process behind.
    chunks = []
    if input_file is not None:
        with open(input_file, 'rb') as f:
            chunks.append(f.read())
    if input_text is not None:
        chunks.append(input_text)

    process = subprocess.Popen(cmd, **popen_kwargs)
    # Feed stdin through `communicate`: writing to the pipe by hand can
    # deadlock once the child blocks on a full stdout pipe.
    raw = process.communicate(b''.join(chunks) if chunks else None)[0]

    # Pipe output is `bytes`: a distinct type needing a decode on Python 3,
    # but the native `str` on Python 2, where it is passed through untouched.
    needs_decode = not isinstance(raw, str)

    # Raised explicitly (rather than with `assert`) so the check survives `-O`.
    if process.returncode != 0:
        raise AssertionError(
            "Runtime Error: %s" % (
                raw.rstrip().decode('utf-8') if needs_decode else raw
            )
        )

    return raw.decode('utf-8') if needs_decode else raw
|
||
|
|
||
|
|
||
|
class IgnoreRule(namedtuple('IgnoreRule', 'tag id classes')):
    """One ignore selector, held as its `tag`, `id` and `classes` parts."""
|
||
|
|
||
|
|
||
|
class Spelling(object):
    """
    Spell check HTML documents with aspell.

    Settings are read from a YAML configuration file; the keys used are
    `docs`, `dictionary`, `attributes` and `ignores`.
    """

    # File name of the compiled custom dictionary; `__init__` resolves it
    # against the current working directory.
    DICTIONARY = 'dictionary.bin'
    # One selector component: a tag name, `#id` or `.class`.
    RE_SELECTOR = re.compile(r'(\#|\.)?[-\w]+')

    def __init__(self, config_file):
        """
        Initialize from the configuration in `config_file`.

        `docs` lists files/directories to check, `dictionary` lists extra
        accepted words, `attributes` names tag attributes whose values are
        checked too, and `ignores` lists selectors of tags to skip.
        """

        # An empty file, or a key left without a value, loads as `None`;
        # fall back to empty containers so there is simply nothing to do.
        config = read_config(config_file) or {}
        self.docs = config.get('docs') or []
        self.dictionary = '\n'.join(config.get('dictionary') or []).encode('utf-8')
        self.attributes = set(config.get('attributes') or [])
        self.ignores = self.ignore_rules(*(config.get('ignores') or []))
        self.dict_bin = os.path.abspath(self.DICTIONARY)

    def ignore_rules(self, *args):
        """
        Process ignore rules.

        Split each ignore selector string (e.g. `div#main.note`) into tag,
        id, and classes. Selectors are lowercased first. Returns a list of
        `IgnoreRule`; selectors that yield no parts are dropped.

        Raises `ValueError` when a selector names more than one tag or more
        than one id.
        """

        ignores = []

        for arg in args:
            tag = None
            tag_id = None
            classes = set()

            for m in self.RE_SELECTOR.finditer(arg.lower()):
                part = m.group(0)
                if part.startswith('.'):
                    classes.add(part[1:])
                elif part.startswith('#'):
                    # A second id must be rejected here; otherwise it would
                    # fall through and be mistaken for the tag name.
                    if tag_id is not None:
                        raise ValueError('Bad selector!')
                    tag_id = part[1:]
                elif tag is None:
                    tag = part
                else:
                    raise ValueError('Bad selector!')

            if tag or tag_id or classes:
                # Sorted so the resulting rule does not depend on set order.
                ignores.append(IgnoreRule(tag, tag_id, tuple(sorted(classes))))

        return ignores

    def compile_dictionaries(self):
        """
        Compile user dictionary.

        Removes any previously compiled dictionary, then pipes the configured
        word list to `aspell ... create master` and prints aspell's output.
        """

        if os.path.exists(self.dict_bin):
            os.remove(self.dict_bin)

        print("Compiling Dictionary...")
        print(
            console(
                [
                    'aspell',
                    '--lang=en',
                    '--encoding=utf-8',
                    'create',
                    'master',
                    os.path.abspath(self.dict_bin)
                ],
                input_text=self.dictionary
            )
        )

    def skip_tag(self, el):
        """
        Determine if tag should be skipped.

        `el` is skipped when any ignore rule matches it. A rule matches when
        every part it specifies (tag name, id, all of its classes) agrees with
        the element; comparisons are done in lowercase.
        """

        name = el.name.lower()
        tag_id = el.attrs.get('id', '').lower()
        classes = set(c.lower() for c in el.attrs.get('class', []))

        for rule in self.ignores:
            if rule.tag and rule.tag != name:
                continue
            if rule.id and rule.id != tag_id:
                continue
            if rule.classes and not classes.issuperset(rule.classes):
                continue
            return True

        return False

    def html_to_text(self, tree, root=True):
        """
        Parse the HTML creating a buffer with each tags content.

        Skip any selectors specified and include attributes if specified.
        Ignored tags will not have their attributes scanned either.

        Returns one space-joined string when `root` is true, otherwise the
        list of collected text fragments (used while recursing).
        """

        text = []

        if not self.skip_tag(tree):
            for attr in self.attributes:
                value = tree.attrs.get(attr)
                if isinstance(value, (list, tuple)):
                    # Multi-valued attributes such as `class` arrive as a
                    # list; flatten them so the final join gets strings only.
                    value = ' '.join(value)
                if value:
                    text.append(value)

            for child in tree:
                if isinstance(child, bs4.element.Tag):
                    # Recurse into empty elements as well, so attributes such
                    # as `alt` on `<img>` are not silently left unchecked.
                    text.extend(self.html_to_text(child, False))
                else:
                    text.append(str(child))

        return ' '.join(text) if root else text

    def check_spelling(self, html_file):
        """
        Check spelling.

        Extracts the text of `html_file`, runs it through `aspell list` with
        the compiled dictionary, and prints any misspelled words. Returns
        `True` when at least one misspelled word was reported.
        """

        with codecs.open(html_file, 'r', encoding='utf-8') as file_obj:
            html = bs4.BeautifulSoup(file_obj.read(), "html5lib")
        text = self.html_to_text(html.html)

        wordlist = console(
            [
                'aspell',
                'list',
                '--lang=en',
                '--mode=url',
                '--encoding=utf-8',
                '--extra-dicts',
                self.dict_bin
            ],
            input_text=text.encode('utf-8')
        )
        # `splitlines` + `strip` so CRLF output does not leave a stray `\r`
        # on every word.
        words = sorted(set(w.strip() for w in wordlist.splitlines()) - set(['']))

        if words:
            print('Misspelled words in %s' % html_file)
            print('-' * 80)
            for word in words:
                print(word)
            print('-' * 80)
            print('\n')

        return bool(words)

    def _html_files(self):
        """Yield each `.html` file named in, or found under, the configured docs."""

        for doc in self.docs:
            if os.path.isdir(doc):
                for base, _, files in os.walk(doc):
                    for name in files:
                        if name.lower().endswith('.html'):
                            yield os.path.join(base, name)
            elif doc.lower().endswith('.html'):
                yield doc

    def check(self):
        """
        Walk documents and initiate spell check.

        Compiles the custom dictionary, then checks every HTML file. Returns
        `True` when any file contained a misspelled word.
        """

        self.compile_dictionaries()

        print('Spell Checking...')
        fail = False
        for file_name in self._html_files():
            # Keep going after a failure so every file gets reported.
            if self.check_spelling(file_name):
                fail = True
        return fail
|
||
|
|
||
|
|
||
|
def main():
    """Spell check using `.spelling.yml` and return what `Spelling.check` reports."""

    checker = Spelling('.spelling.yml')
    result = checker.check()
    return result
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Hand the value returned by `main` to `sys.exit` as the exit status.
    exit_status = main()
    sys.exit(exit_status)
|