dotfiles/.config/sublime-text-3/Packages/mdpopups/tests/spellcheck.py

"""Spell check with aspell."""
from __future__ import unicode_literals
import subprocess
import os
import sys
import codecs
import bs4
import yaml
import re
from collections import namedtuple

# True when the running interpreter is Python 3 or newer.
PY3 = sys.version_info >= (3, 0)


def yaml_load(source, loader=yaml.Loader):
    """
    Parse YAML `source` with a private subclass of `loader`.

    The subclass registers its own constructor for the standard YAML string
    tag so every string scalar is produced via `construct_scalar`
    (see http://stackoverflow.com/a/2967461/3609487). Because the constructor
    is attached to the subclass, the `loader` class passed in is not modified.

    NOTE(review): the default `yaml.Loader` is PyYAML's full loader, which can
    construct arbitrary Python objects; only feed this trusted input.
    """

    class Loader(loader):
        """Throwaway loader subclass that carries the string constructor."""

    # Registered on the subclass only, so the caller's loader stays untouched.
    Loader.add_constructor(
        'tag:yaml.org,2002:str',
        lambda self, node: self.construct_scalar(node)
    )

    return yaml.load(source, Loader)


def read_config(file_name):
    """Return the parsed YAML content of the UTF-8 encoded file `file_name`."""

    with codecs.open(file_name, 'r', encoding='utf-8') as handle:
        raw = handle.read()
    return yaml_load(raw)


def console(cmd, input_file=None, input_text=None):
    """
    Run `cmd` and return its combined stdout/stderr as text.

    `cmd` is an argument list executed without a shell. The bytes of
    `input_file` (if given) followed by `input_text` (if given) are sent to
    the child's stdin. `input_text` may be bytes or text; text is encoded as
    UTF-8 before it is sent.

    Raises `AssertionError` carrying the command output when the command
    exits with a non-zero status.
    """

    # Gather stdin data before spawning, so a bad `input_file` path does not
    # leave an orphaned child process behind.
    payload = b''
    if input_file is not None:
        with open(input_file, 'rb') as f:
            payload += f.read()
    if input_text is not None:
        if not isinstance(input_text, bytes):
            input_text = input_text.encode('utf-8')
        payload += input_text

    startupinfo = None
    if sys.platform.startswith('win'):
        # Ask Windows to honour the show-window setting of STARTUPINFO.
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW

    process = subprocess.Popen(
        cmd,
        startupinfo=startupinfo,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        stdin=subprocess.PIPE,
        shell=False
    )

    # Let communicate() feed stdin while draining stdout. Writing everything
    # to stdin first can deadlock once the child's output pipe fills up.
    output = process.communicate(payload)[0]

    # Pipes yield bytes on Python 3; on Python 2 they are already native str.
    if not isinstance(output, str):
        output = output.decode('utf-8')

    # Raised explicitly (rather than via `assert`) so the check survives -O.
    if process.returncode != 0:
        raise AssertionError("Runtime Error: %s" % output.rstrip())

    return output


class IgnoreRule(namedtuple('IgnoreRule', ('tag', 'id', 'classes'))):
    """
    One parsed ignore selector.

    Fields, in order: `tag`, `id`, and `classes`.
    """


class Spelling(object):
    """
    Spell check object.

    Reads a YAML configuration, compiles the configured word list into an
    aspell dictionary, and runs `aspell list` over the text extracted from
    the configured HTML documents.
    """

    # File name of the compiled user dictionary (resolved against the CWD).
    DICTIONARY = 'dictionary.bin'
    # One selector token: optional '#' or '.' prefix followed by a name.
    RE_SELECTOR = re.compile(r'(\#|\.)?[-\w]+')

    def __init__(self, config_file):
        """
        Initialize from the YAML file `config_file`.

        Recognized keys: `docs`, `dictionary`, `attributes`, and `ignores`.
        Missing or empty keys fall back to empty values.
        """

        # An empty YAML document loads as None; treat it as an empty config.
        config = read_config(config_file) or {}
        self.docs = config.get('docs') or []
        self.dictionary = '\n'.join(config.get('dictionary') or []).encode('utf-8')
        self.attributes = set(config.get('attributes') or [])
        self.ignores = self.ignore_rules(*(config.get('ignores') or []))
        self.dict_bin = os.path.abspath(self.DICTIONARY)

    def ignore_rules(self, *args):
        """
        Process ignore rules.

        Split each (lowercased) ignore selector string into a tag, an id, and
        classes, and return the resulting list of `IgnoreRule` objects.
        Selectors that yield no tokens are dropped.

        Raises `ValueError` when a selector names more than one tag or more
        than one id.
        """

        ignores = []

        for arg in args:
            tag = None
            tag_id = None
            classes = set()

            for m in self.RE_SELECTOR.finditer(arg.lower()):
                token = m.group(0)
                if token.startswith('.'):
                    classes.add(token[1:])
                elif token.startswith('#'):
                    # A repeated id must be rejected here; otherwise it would
                    # fall through and be mistaken for the tag name.
                    if tag_id is not None:
                        raise ValueError('Bad selector!')
                    tag_id = token[1:]
                elif tag is None:
                    tag = token
                else:
                    raise ValueError('Bad selector!')

            if tag or tag_id or classes:
                ignores.append(IgnoreRule(tag, tag_id, tuple(classes)))

        return ignores

    def compile_dictionaries(self):
        """Compile the user dictionary, replacing any previously compiled file."""

        if os.path.exists(self.dict_bin):
            os.remove(self.dict_bin)
        print("Compiling Dictionary...")
        print(
            console(
                [
                    'aspell',
                    '--lang=en',
                    '--encoding=utf-8',
                    'create',
                    'master',
                    self.dict_bin
                ],
                input_text=self.dictionary
            )
        )

    def skip_tag(self, el):
        """
        Determine if tag should be skipped.

        Returns True when `el` satisfies every populated part (tag, id,
        classes) of at least one ignore rule. Comparison is case-insensitive
        on the element side; rules are expected to be lowercase already.
        """

        if not self.ignores:
            return False

        name = el.name.lower()
        el_id = el.attrs.get('id', '').lower()
        el_classes = set(c.lower() for c in el.attrs.get('class', []))

        for rule in self.ignores:
            if rule.tag and rule.tag != name:
                continue
            if rule.id and rule.id != el_id:
                continue
            if rule.classes and not el_classes.issuperset(rule.classes):
                continue
            return True
        return False

    def html_to_text(self, tree, root=True):
        """
        Parse the HTML creating a buffer with each tags content.

        Skip any selectors specified and include attributes if specified.
        Ignored tags will not have their attributes scanned either.

        Returns a single space-joined string when `root` is true, otherwise
        the list of collected text fragments.
        """

        text = []

        if not self.skip_tag(tree):
            for attr in self.attributes:
                value = tree.attrs.get(attr)
                # Multi-valued attributes (e.g. `class`) arrive as lists.
                if isinstance(value, (list, tuple)):
                    value = ' '.join(value)
                if value:
                    text.append(value)

            for child in tree:
                if isinstance(child, bs4.element.Tag):
                    # Recurse even into empty tags so attributes on elements
                    # without content (e.g. `<img alt=...>`) are collected.
                    text.extend(self.html_to_text(child, False))
                else:
                    text.append(str(child))

        return ' '.join(text) if root else text

    def check_spelling(self, html_file):
        """
        Check spelling of one HTML file.

        Prints the sorted, de-duplicated misspelled words reported by aspell
        and returns True when there is at least one, otherwise False.
        """

        with codecs.open(html_file, 'r', encoding='utf-8') as file_obj:
            markup = file_obj.read()
        html = bs4.BeautifulSoup(markup, "html5lib")
        text = self.html_to_text(html.html)

        wordlist = console(
            [
                'aspell',
                'list',
                '--lang=en',
                '--mode=url',
                '--encoding=utf-8',
                '--extra-dicts',
                self.dict_bin
            ],
            input_text=text.encode('utf-8')
        )
        # Strip so a trailing '\r' from CRLF output does not end up in words.
        words = sorted(set(w.strip() for w in wordlist.split('\n')) - set(['']))

        if not words:
            return False

        print('Misspelled words in %s' % html_file)
        print('-' * 80)
        for word in words:
            print(word)
        print('-' * 80)
        print('\n')
        return True

    def check(self):
        """
        Walk documents and initiate spell check.

        Each entry of `self.docs` is either a directory (searched recursively
        for `.html` files) or a path to a single `.html` file. Returns True
        when any checked file contained misspelled words.
        """

        self.compile_dictionaries()

        print('Spell Checking...')
        fail = False
        for doc in self.docs:
            if os.path.isdir(doc):
                for base, _dirs, files in os.walk(doc):
                    for f in files:
                        if f.lower().endswith('.html'):
                            if self.check_spelling(os.path.join(base, f)):
                                fail = True
            elif doc.lower().endswith('.html'):
                if self.check_spelling(doc):
                    fail = True
        return fail


def main():
    """Spell check using `.spelling.yml` and return the result of the check."""

    return Spelling('.spelling.yml').check()


# When run as a script, the value returned by main() is handed to sys.exit():
# a true result (misspellings were found) yields a non-zero exit status.
if __name__ == "__main__":
    sys.exit(main())