# Source code for latexml_html_cleaner.clean_html

"""
Class definition of htmlcleaner
"""

import logging
import re
import sys
from pathlib import Path

from bs4 import BeautifulSoup

_logger = logging.getLogger(__name__)
import contextlib



[docs]
@contextlib.contextmanager
def smart_open(filename=None):
    """
    Context manager that opens a file for writing, or falls back to standard output

    The file is opened in text mode ("w", utf-8) and closed again when the
    context exits. Standard output is never closed.

    Args:
        filename (Path or str, optional): Path of the file to write to. When it is
            empty, None, or "-", standard output is used instead.

    Yields:
        file: writable file-like object
    """
    # Coerce through Path so that plain strings are accepted as well as Path objects
    write_to_file = bool(filename) and Path(filename).as_posix() != "-"

    if write_to_file:
        fh = open(filename, mode="w", encoding="utf-8")
    else:
        fh = sys.stdout

    try:
        yield fh
    finally:
        # Only close what we opened ourselves; decide on the flag rather than on
        # the identity of sys.stdout, which may have been rebound in the meantime
        if write_to_file:
            fh.close()




[docs]
def to_lijst(values):
    """
    Make sure the argument is a list

    Args:
        values: a list, or any other single value

    Returns:
        list: *values* itself when it already is a list, otherwise a new
        one-element list holding *values*
    """
    return values if isinstance(values, list) else [values]




[docs]
class HTMLCleaner:
    """
    Class to clean the contents of an html-file

    Constructing an instance immediately reads *filename*, cleans it, and writes
    the result to the output file (see :meth:`clean_html`).

    Args:
        filename (Path): path to the html-file to clean
        skip_tags (dict, optional): tags to drop entirely; replaces the default tags
        overwrite (bool, optional): overwrite the existing file if it exists
        find_and_replace_patterns: (dict, optional): replace these patterns
        clear_default_patterns (bool, optional): clear the default patterns if they exist
        output_filename (Path, optional): filename to write to. If not given, base the name on the input file

    Attributes:
        clean_soup (str): the cleaned html as a string (None until cleaning has run)

    Notes:
        * By default, all attributes starting with ltx are skipped.
        * With ```skip_tags``` we can drop entire ```<>``` environments based on the environment's name (the key of
          the dict) and then a list of attribute key/value pairs.
        * If such a key/value pair occurs, the entire ```<>``` tag is discarded.
        * We first define a default list. In this example, a tag
          ```<span class="ltx_bibblock ltx_bib_cited">Cited by etc. </span>```
          is discarded in its entirety, including all nested values.
        * We can also specify the values of a key/value pair in a list if more than one tag has the same
          key name, but different values.
    """

    def __init__(
        self,
        filename,
        skip_tags=None,
        overwrite=False,
        find_and_replace_patterns=None,
        clear_default_patterns=False,
        output_filename=None,
    ):
        """
        Constructor for HTMLCleaner
        """
        self.filename = Path(filename)
        self.clean_soup = None
        _logger.debug(f"Make filename for {filename}")
        if output_filename is None:
            self.output_file = self._default_output_file(self.filename, overwrite)
        else:
            self.output_file = Path(output_filename)
        _logger.debug(f"Cleaning from {filename} to {self.output_file}")

        # Default find and replace. Always remove all double white lines
        if clear_default_patterns:
            self.find_and_replace_patterns = {}
        else:
            # Start with some default strings that we are going to delete
            self.find_and_replace_patterns = {
                "\n{2,}": "\n\n",
                "<span>•</span>": "",
                "<span><span>–</span></span>": "",
                "title=": "<b>Intermezzo:</b> ",
                "Â": "",
            }

        # User patterns are added to (and may override) the defaults
        if find_and_replace_patterns is not None:
            self.find_and_replace_patterns.update(find_and_replace_patterns)

        if skip_tags is None:
            self.skip_tags = {
                "span": {
                    "class": ["ltx_bibblock ltx_bib_cited", "ltx_tag ltx_tag_item"]
                },
                "link": {"rel": None},
                "meta": {"content": None},
                "div": {"class": "ltx_dates"},
                "footer": {None: None},
                "header": {None: None},
            }
        else:
            self.skip_tags = skip_tags

        # Attribute values to strip (the tag itself is kept). The values are
        # regular expressions, hence the raw strings.
        self.skip_tag_attributes = {
            "a": {
                "href": [r"^(A|Ch)\d+.html$", r".*[\d\w\.%]#[\d\w\.%].*", "^#.*"],
                "title": "",
            },
            "li": {"style": None},
            "span": {"style": None},
            None: {None: "ltx_", "id": None},
        }

        self.clean_html()

    @staticmethod
    def _default_output_file(filename, overwrite=False):
        """
        Derive the output path from the input path

        The last suffix of *filename* is replaced by ``.html``; unless *overwrite*
        is set, ``_clean`` is appended to the stem first. The new name is built by
        concatenation rather than ``with_suffix`` so that stems containing dots
        (``report.v1.2.html``) are not truncated.

        Args:
            filename (Path): path of the input file
            overwrite (bool, optional): keep the input name instead of adding ``_clean``

        Returns:
            Path: path of the output file, in the same directory as *filename*
        """
        filename = Path(filename)
        marker = "" if overwrite else "_clean"
        return filename.with_name(f"{filename.stem}{marker}.html")

    @staticmethod
    def _remaining_values(values, unwanted):
        """
        Return the unique items of *values* not in *unwanted*, in their original order

        Args:
            values (list): current values of an attribute
            unwanted (iterable): values that have to be removed

        Returns:
            list: the values that are kept
        """
        unwanted = set(unwanted)
        # dict.fromkeys de-duplicates while keeping the order deterministic
        return [value for value in dict.fromkeys(values) if value not in unwanted]

    def _clean_attributes(self, tag):
        """
        Work out the attributes a tag keeps after cleaning

        Args:
            tag: BeautifulSoup tag to inspect; it is not modified

        Returns:
            dict or None: the new attributes of the tag, or None when the tag
            matches ``self.skip_tags`` and must be dropped entirely
        """
        new_attrs = {}
        for attr_key, attr_value in tag.attrs.items():
            attributes = to_lijst(attr_value)

            drop_tag = skip_this_tag(
                tag=tag,
                attribute_key=attr_key,
                attribute_values=attributes,
                skip_tags=self.skip_tags,
                combined=True,
            )
            if drop_tag:
                # This tag is skipped entirely; no need to look at other attributes
                _logger.debug(f"Dropping complete tag  {attr_key} {attr_value}")
                return None

            # Collect the values of this attribute that have to be removed
            to_remove = skip_this_tag(
                tag=tag,
                attribute_key=attr_key,
                attribute_values=attributes,
                skip_tags=self.skip_tag_attributes,
            )

            kept = self._remaining_values(attributes, to_remove.get(attr_key, ()))
            if kept:
                # If any values are left over, store them in the new attributes
                new_attrs[attr_key] = kept
            else:
                # we have nothing left so, we can omit this attr key
                _logger.debug(f"Dropping {attr_key} from {tag.name}")
        return new_attrs

    def clean_html(self):
        """
        Read the html file and clean the html code

        The cleaned html is stored as a string in ``self.clean_soup`` and written
        to ``self.output_file``.
        """

        with open(file=self.filename, encoding="utf-8") as stream:
            html = stream.read()
        soup = BeautifulSoup(html, "html.parser")
        _logger.debug("Start cleaning ")
        _logger.debug(soup)

        # Remove all attributes starting with ltx (which are latexml definitions)
        # and drop the tags that match self.skip_tags
        for tag in soup.find_all(True):
            new_attrs = self._clean_attributes(tag)
            if new_attrs is None:
                tag.extract()
            else:
                # Overwrite the old attributes with our new ones
                tag.attrs = new_attrs

        # Here we can still remove elements based on normal string matches
        self.clean_soup = str(soup)
        for find, replace in self.find_and_replace_patterns.items():
            _logger.debug(f"replacing {find} with {replace}")
            self.clean_soup = re.sub(find, replace, self.clean_soup)

        _logger.info(f"Cleaning: {self.filename} -> {self.output_file}")

        with smart_open(self.output_file) as stream:
            stream.write(self.clean_soup)





[docs]
def skip_this_tag(tag, attribute_key, attribute_values, skip_tags, combined=False):
    """
    Collect the attribute values of a tag that match the skip rules

    Args:
        tag (object):  beautiful soup tag to inspect; only its ``name`` is used
        attribute_key (str): key of the attribute
        attribute_values (list): values of the attribute
        skip_tags (dict): rules of the form ``{tag name: {attribute key: value(s)}}``.
            A tag name or attribute key of None applies to every tag or key; a value
            of None matches every attribute value
        combined (bool, optional): if True, a rule value has to be equal to all
            attribute values joined by a space. Otherwise, each attribute value is
            matched separately against the rule value as a regular expression

    Returns:
        dict: empty if nothing matched, otherwise ``{attribute_key: [matched values]}``
    """
    joined_values = " ".join(attribute_values)
    matches = {}
    for rule_tag, rule_attributes in skip_tags.items():
        # A rule applies to the tag with the same name, or to all tags if it is None
        if rule_tag is not None and rule_tag != tag.name:
            continue

        for rule_key, rule_value in rule_attributes.items():
            # Same for the attribute key: None is a wildcard
            if rule_key is not None and rule_key != attribute_key:
                continue

            for candidate in to_lijst(rule_value):
                if combined:
                    # The match must apply to the combined attr string,
                    # such as 'ltx_bibblock ltx_bib_cited'
                    if candidate is None or candidate == joined_values:
                        matches[attribute_key] = attribute_values
                    continue

                # Not combined: every single value that matches the regular
                # expression is collected
                for value in attribute_values:
                    try:
                        hit = (
                            value is None
                            or candidate is None
                            or re.match(candidate, value) is not None
                        )
                    except TypeError:
                        _logger.warning(
                            f"Failed to do regular expression for {candidate}"
                        )
                        continue
                    if hit:
                        matches.setdefault(attribute_key, []).append(value)
    return matches