Source code for latexml_html_cleaner.clean_html

"""
Class definition of htmlcleaner
"""

import logging
import re
import sys
from pathlib import Path

from bs4 import BeautifulSoup

_logger = logging.getLogger(__name__)
import contextlib


@contextlib.contextmanager
def smart_open(filename=None):
    """
    Context manager that opens a file for writing or falls back to standard output

    Args:
        filename (Path or str, optional): Path of the file to write. When it is
            empty, None or "-", standard output is used instead of a file.

    Yields:
        file: writable text stream; either the opened file (UTF-8, truncated)
        or ``sys.stdout``

    Notes:
        A file opened here is closed when the context exits; ``sys.stdout`` is
        never closed.
    """
    # str() makes the "-" check work for both pathlib.Path and plain strings
    use_stdout = not filename or str(filename) == "-"
    fh = sys.stdout if use_stdout else open(filename, mode="w", encoding="utf-8")
    try:
        yield fh
    finally:
        if fh is not sys.stdout:
            fh.close()
def to_lijst(values):
    """
    Make sure we are dealing with a list

    Args:
        values: a single value (e.g. a string) or a list of values

    Returns:
        list: *values* itself when it already is a list, otherwise a new
        one-element list holding *values*
    """
    return values if isinstance(values, list) else [values]
class HTMLCleaner:
    """
    Class to clean the contents of an html-file

    Args:
        filename (Path): path to the html-file to clean
        skip_tags (dict, optional): tags to drop entirely; replaces the default
            tags when given
        overwrite (bool, optional): overwrite the existing file if it exists
        find_and_replace_patterns: (dict, optional): replace these patterns
        clear_default_patterns (bool, optional): clear the default patterns if
            they exist
        output_filename (Path, optional): filename to write to. If not given,
            base the name on the input file

    Attributes:
        clean_soup (str): the cleaned html as a string; None until
            :meth:`clean_html` has run

    Notes:
        * By default, all attributes starting with ltx are skipped.
        * With ```skip_tags``` we can drop entire ```<>``` environments based on
          the environment's name (the key of the dict) and then a dict of
          attribute key/value pairs.
        * If such a key/value pair occurs, the entire ```<>``` tag is discarded.
        * With the default list, a tag
          ```<span class="ltx_bibblock ltx_bib_cited">Cited by etc. </span>```
          is discarded in its entirety, including all nested values.
        * The value of a key/value pair may be a list if several tags share the
          same key name but have different values.
        * The constructor immediately runs :meth:`clean_html`, so creating an
          instance reads the input file and writes the output file.
    """

    def __init__(
        self,
        filename,
        skip_tags=None,
        overwrite=False,
        find_and_replace_patterns=None,
        clear_default_patterns=False,
        output_filename=None,
    ):
        """
        Constructor for HTMLCleaner
        """
        self.filename = Path(filename)
        self.clean_soup = None

        _logger.debug("Make filename for %s", filename)
        if output_filename is None:
            self.output_file = self._default_output_file(self.filename, overwrite)
        else:
            self.output_file = Path(output_filename)
        _logger.debug("Cleaning from %s to %s", filename, self.output_file)

        # Default find and replace. Always remove all double white lines
        if clear_default_patterns:
            self.find_and_replace_patterns = {}
        else:
            # Start with some default strings that we are going to delete
            self.find_and_replace_patterns = {
                "\n{2,}": "\n\n",
                "<span>•</span>": "",
                "<span><span>–</span></span>": "",
                "title=": "<b>Intermezzo:</b> ",
                "Â": "",
            }

        # User supplied patterns extend (and may override) the defaults
        if find_and_replace_patterns is not None:
            self.find_and_replace_patterns.update(find_and_replace_patterns)

        if skip_tags is None:
            self.skip_tags = {
                "span": {
                    "class": ["ltx_bibblock ltx_bib_cited", "ltx_tag ltx_tag_item"]
                },
                "link": {"rel": None},
                "meta": {"content": None},
                "div": {"class": "ltx_dates"},
                "footer": {None: None},
                "header": {None: None},
            }
        else:
            self.skip_tags = skip_tags

        # Raw strings: these values are regular expressions used with re.match
        self.skip_tag_attributes = {
            "a": {
                "href": [r"^(A|Ch)\d+.html$", r".*[\d\w\.%]#[\d\w\.%].*", "^#.*"],
                "title": "",
            },
            "li": {"style": None},
            "span": {"style": None},
            None: {None: "ltx_", "id": None},
        }

        self.clean_html()

    @staticmethod
    def _default_output_file(filename, overwrite=False):
        """Derive the output path from the input path: ``<stem>.html``, or ``<stem>_clean.html`` when not overwriting"""
        stem = Path(filename).with_suffix("")
        tail = "" if overwrite else "_clean"
        # Build the name explicitly; with_suffix() would cut a dotted stem such
        # as "paper.v2_clean" down to "paper"
        return stem.with_name(f"{stem.name}{tail}.html")

    @staticmethod
    def _remaining_values(values, to_remove):
        """Return the unique items of *values* that are not in *to_remove*, keeping their original order"""
        removed = set(to_remove)
        return [value for value in dict.fromkeys(values) if value not in removed]

    def clean_html(self):
        """
        Read the html file and clean the html code

        The html is parsed, tags matching ``self.skip_tags`` are removed, the
        attribute values matching ``self.skip_tag_attributes`` are stripped, the
        find-and-replace patterns are applied to the resulting string, and the
        result is stored in ``self.clean_soup`` and written to
        ``self.output_file``.

        Notes:
            Matching is driven by the attributes of a tag, so a tag without any
            attributes is never tested against ``self.skip_tags``.
        """
        with open(file=self.filename, encoding="utf-8") as stream:
            html = stream.read()

        soup = BeautifulSoup(html, "html.parser")
        _logger.debug("Start cleaning ")
        _logger.debug(soup)

        # Remove all attributes starting with ltx (which are latexml definitions)
        for tags in soup.find_all(True):
            new_attrs = {}
            dropped = False
            for attr_key, attr_value in tags.attrs.items():
                attributes = to_lijst(attr_value)

                skip_this = skip_this_tag(
                    tag=tags,
                    attribute_key=attr_key,
                    attribute_values=attributes,
                    skip_tags=self.skip_tags,
                    combined=True,
                )
                if skip_this:
                    # This tag is skipped entirely; its other attributes no
                    # longer matter once it is out of the tree
                    _logger.debug("Dropping complete tag %s %s", attr_key, attr_value)
                    tags.extract()
                    dropped = True
                    break

                skip_this = skip_this_tag(
                    tag=tags,
                    attribute_key=attr_key,
                    attribute_values=attributes,
                    skip_tags=self.skip_tag_attributes,
                )

                # skip_this holds the attribute values to remove. Subtract
                # them from the current values, preserving the original order
                # so the output is reproducible between runs
                new_attrs_values = self._remaining_values(
                    attributes, skip_this.get(attr_key, ())
                )

                if new_attrs_values:
                    # Values are left over, so keep them in the new attributes
                    new_attrs[attr_key] = new_attrs_values
                else:
                    # we have nothing left so, we can omit this attr key
                    _logger.debug("Dropping %s from %s", attr_key, tags.name)

            if not dropped:
                # Overwrite the old attributes with our new ones
                tags.attrs = new_attrs

        # Here we can still remove elements based on normal string matches
        self.clean_soup = str(soup)
        for find, replace in self.find_and_replace_patterns.items():
            _logger.debug("replacing %s with %s", find, replace)
            self.clean_soup = re.sub(find, replace, self.clean_soup)

        _logger.info("Cleaning: %s -> %s", self.filename, self.output_file)
        with smart_open(self.output_file) as stream:
            stream.write(self.clean_soup)
def skip_this_tag(tag, attribute_key, attribute_values, skip_tags, combined=False):
    """
    Collect the attribute values of a tag that match the skip rules

    Args:
        tag (object): beautiful soup tag to clean; only its ``name`` is used
        attribute_key (str): key of the attribute
        attribute_values (list): values of the attribute
        skip_tags (dict): skip rules of the form
            ``{tag name: {attribute key: value or list of values}}``. A None
            tag name or attribute key matches every tag or key; a None value
            matches every attribute value.
        combined (bool, optional): if True, a rule value must be equal to all
            attribute values joined by a space; if False, each attribute value
            is matched on its own with ``re.match``

    Returns:
        dict: empty when nothing matched; otherwise ``attribute_key`` mapped to
        the matching values (all of ``attribute_values`` in combined mode)
    """
    joined_values = " ".join(attribute_values)
    matches = {}

    for rule_tag_name, attribute_rules in skip_tags.items():
        # A rule applies to this tag when the names agree or the rule is a wildcard
        if rule_tag_name is not None and rule_tag_name != tag.name:
            continue

        for rule_key, rule_values in attribute_rules.items():
            if rule_key is not None and rule_key != attribute_key:
                continue

            for pattern in to_lijst(rule_values):
                if combined:
                    # The rule must match the complete attribute string, such
                    # as 'ltx_bibblock ltx_bib_cited'
                    if pattern is None or pattern == joined_values:
                        matches[attribute_key] = attribute_values
                    continue

                # Not combined: test every value separately against the
                # regular expression and collect all values that match
                for value in attribute_values:
                    try:
                        is_match = (
                            value is None
                            or pattern is None
                            or re.match(pattern, value) is not None
                        )
                    except TypeError:
                        _logger.warning(
                            f"Failed to do regular expression for {pattern}"
                        )
                        continue
                    if is_match:
                        matches.setdefault(attribute_key, []).append(value)

    return matches