Python re Module (Regular Expressions) Examples

The following are 30 code examples of the Python re module (regular expressions). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module re, or try the search function.
Example #1
Source File: x3270.py    From Robot-Framework-Mainframe-3270-Library with MIT License 6 votes vote down vote up
def take_screenshot(self, height='410', width='670'):
        """Capture the current IBM 3270 Mainframe screen as an HTML file.

           The file is written to the configured screenshot folder (the
           Robot Framework log folder by default; see `Set Screenshot
           Folder`) and embedded in the log inside an iframe of the given
           ``height``/``width``.

           Examples:
               | Take Screenshot |
               | Take Screenshot | height=500 | width=700 |
        """
        # Millisecond timestamp keeps successive screenshot names unique.
        timestamp = str(int(round(time.time() * 1000)))
        relative_path = os.path.join(
            self.imgfolder, '%s_%s.%s' % ('screenshot', timestamp, 'html'))
        self.mf.save_screen(os.path.join(self.output_folder, relative_path))
        # Forward slashes so the iframe src is a valid URL on Windows too.
        iframe_html = '<iframe src="%s" height="%s" width="%s"></iframe>' % (
            relative_path.replace("\\", "/"), height, width)
        logger.write(iframe_html, level='INFO', html=True)
Example #2
Source File: regex.py    From wextracto with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def re_group(pattern, group=1, flags=0):
    """
    Build a :mod:`composable <wex.composed>` callable that yields the given
    capture group for every match of ``pattern`` in the (flattened) input.

    :param pattern: The regular expression.
    :param group: The group from the `MatchObject <https://docs.python.org/2/library/re.html#re.MatchObject.group>`_.
    :param flags: Flags to use when compiling the
                  `pattern <https://docs.python.org/2/library/re.html#re.compile>`_.
    """
    # Compile once, up front; the closure reuses the compiled object.
    regex = re.compile(pattern, flags)

    @composable
    def regroup(src):
        for text in flatten(src):
            for found in regex.finditer(text):
                yield found.group(group)

    return regroup
Example #3
Source File: regex.py    From wextracto with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def re_groupdict(pattern, flags=0):
    """
    Returns a :mod:`composable <wex.composed>` callable that
    extracts the group dictionary for every match of a regular expression.

    :param pattern: The regular expression.
    :param flags: Flags to use when compiling the
                  `pattern <https://docs.python.org/2/library/re.html#re.compile>`_.
    """
    # Compile exactly once (the original compiled the pattern twice in a row).
    compiled = re.compile(pattern, flags)

    @composable
    def redict(src):
        for string in flatten(src):
            for match in compiled.finditer(string):
                yield match.groupdict()
    return redict
Example #4
Source File: sanity.py    From reframe with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def assert_found(patt, filename, msg=None, encoding='utf-8'):
    '''Assert that regex pattern ``patt`` is found in the file ``filename``.

    :arg patt: The regex pattern to look for.
        Any standard Python `regular expression
        <https://docs.python.org/3/library/re.html#regular-expression-syntax>`_
        is accepted.
    :arg filename: The name of the file to examine.
        Any :class:`OSError` raised while processing the file will be
        propagated as a :class:`reframe.core.exceptions.SanityError`.
    :arg msg: A custom message to use if the assertion fails.
    :arg encoding: The name of the encoding used to decode the file.
    :returns: ``True`` on success.
    :raises reframe.core.exceptions.SanityError: if assertion fails.
    '''
    matches_found = count(finditer(patt, filename, encoding))
    try:
        evaluate(assert_true(matches_found))
    except SanityError:
        # Fall back to the stock message unless the caller supplied one.
        raise SanityError(
            _format(msg or "pattern `{0}' not found in `{1}'", patt, filename))
    else:
        return True
Example #5
Source File: sanity.py    From reframe with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def extractall(patt, filename, tag=0, conv=None, encoding='utf-8'):
    '''Extract all values from the capturing group ``tag`` of a matching regex
    ``patt`` in the file ``filename``.

    :arg patt: The regex pattern to search.
        Any standard Python `regular expression
        <https://docs.python.org/3/library/re.html#regular-expression-syntax>`_
        is accepted.
    :arg filename: The name of the file to examine.
    :arg encoding: The name of the encoding used to decode the file.
    :arg tag: The regex capturing group to be extracted.
        Group ``0`` refers always to the whole match.
        Since the file is processed line by line, this means that group ``0``
        returns the whole line that was matched.
    :arg conv: A callable that takes a single argument and returns a new value.
        If provided, it will be used to convert the extracted values before
        returning them.
    :returns: A list of the extracted values from the matched regex.
    :raises reframe.core.exceptions.SanityError: In case of errors.
    '''
    # A list comprehension is the idiomatic form of list(<genexpr>).
    return [evaluate(x)
            for x in extractiter(patt, filename, tag, conv, encoding)]
Example #6
Source File: test_reader.py    From IRCLogParser with GNU General Public License v3.0 6 votes vote down vote up
def test_linux_input_slack(self):
        """Regression test for ``reader.linux_input_slack``.

        Compares both the parsed log data and the console output produced
        while parsing against fixtures previously recorded on disk.
        """
        expected_captured_output = util.load_from_disk(self.current_directory +"/data/stdout_captured_linux_input_slack")

        # Redirect stdout so the reader's progress messages can be captured.
        capturedOutput = StringIO.StringIO()
        sys.stdout = capturedOutput
        log_data = reader.linux_input_slack(self.current_directory + "/data/slackware/", self.starting_date, self.ending_date)
        output = capturedOutput.getvalue()
        capturedOutput.close()
        sys.stdout = sys.__stdout__

        # Strip the machine-specific absolute path prefix so the comparison
        # does not depend on where the repository is checked out.
        #See https://docs.python.org/2/library/re.html for more details.
        # string 'Working on: /any_valid_path/IRCLogParser/test/unit-test/test_lib/test_in_out/data/log/2013/01/04/#kubuntu-devel.txt\n' is replaced by
        # 'Working on: IRCLogParser/test/unit-test/test_lib/test_in_out/data/log/2013/01/04/#kubuntu-devel.txt\n'
        output = re.sub(r'(?P<begin>.+ )/.+/(?P<constant>IRCLogParser/.+\n)',r'\g<begin>\g<constant>', output)
        self.assertEqual(log_data, self.log_data)
        self.assertEqual(expected_captured_output, output)
Example #7
Source File: google_doc.py    From kobo-predict with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def _navigation_list(self, node=None):
        """Render the table of contents of this document as nested HTML lists.

        With no argument this (re)builds the section tree and recurses from
        its root; on recursive calls ``node`` is the subtree to render: a
        list item for the node itself plus an unordered list for children.
        """
        if node is None:
            # Top-level call: build the section tree, then render its root.
            self._construct_section_tree()
            return self._navigation_list(self._section_tree)
        pieces = []
        if 'title' in node.value and 'id' in node.value:
            pieces.append('<li>%s</li>' % node.value.url())
        if len(node) > 0:
            rendered_children = "\n".join(
                self._navigation_list(child) for child in node)
            pieces.append("<ul>%s</ul>" % rendered_children)
        return "".join(pieces)
Example #8
Source File: google_doc.py    From kobo-predict with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def _construct_section_tree(self):
        """
        For some weird reason Google Documents doesn't like nesting
        lists, so their table of contents requires a bunch of special
        formatting. Instead of trying to hack off what they provide
        us, we create a tree of sections based on each section's
        level. This tree will be used to construct the html for the
        table of contents.
        """
        # Sentinel root at level 0; real sections start at level 1.
        self._section_tree = TreeNode(Section(level=0))
        current_node = self._section_tree
        for section in self._sections:
            # Climb back up until the current node can be this section's parent
            # (its level must be strictly smaller than the section's).
            while section['level'] <= current_node.value['level']:
                current_node = current_node.parent
            # Insert empty placeholder sections for any skipped heading levels
            # so parent/child levels always differ by exactly one.
            while section['level'] > current_node.value['level'] + 1:
                empty_section = Section(level=current_node.value['level'] + 1)
                current_node = current_node.add_child(empty_section)
            assert section['level'] == current_node.value['level'] + 1
            current_node = current_node.add_child(section)
Example #9
Source File: google_doc.py    From kobo-predict with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def set_html(self, html):
        """
        When setting the html for this Google Document we do two
        things:

        1. We extract the content from the html. Using a regular
           expression we pull the meat of the document out of the body
           of the html; we also cut off the footer Google adds on
           automatically.

        2. We extract the various sections from the content of the
           document. Again using a regular expression, we look for h1,
           h2, ... tags to split the document up into sections. Note:
           it is important when you are writing your Google Document
           to use the heading text styles, so this code will split
           things correctly.
        """
        self._html = html
        # Order matters: sections are extracted from the content produced
        # by _extract_content().
        self._extract_content()
        self._extract_sections()
Example #10
Source File: test_reader.py    From IRCLogParser with GNU General Public License v3.0 6 votes vote down vote up
def test_linux_input(self):
        """Regression test for ``reader.linux_input`` on a single channel.

        Compares both the parsed log data and the console output produced
        while parsing against fixtures previously recorded on disk.
        """
        expected_capturedOutput = util.load_from_disk(self.current_directory + "/data/stdout_captured_linux_input")

        # Redirect stdout so the reader's progress messages can be captured.
        capturedOutput = StringIO.StringIO()
        sys.stdout = capturedOutput
        log_data = reader.linux_input(self.current_directory + "/data/log/", self.channel_name, self.starting_date, self.ending_date)
        output = capturedOutput.getvalue()
        capturedOutput.close()
        sys.stdout = sys.__stdout__
        # Strip the machine-specific absolute path prefix so the comparison
        # does not depend on where the repository is checked out.
        #See https://docs.python.org/2/library/re.html for more details.
        # string 'Working on: /any_valid_path/IRCLogParser/test/unit-test/test_lib/test_in_out/data/log/2013/01/04/#kubuntu-devel.txt\n' is replaced by
        # 'Working on: IRCLogParser/test/unit-test/test_lib/test_in_out/data/log/2013/01/04/#kubuntu-devel.txt\n'
        output = re.sub(r'(?P<begin>.+ )/.+/(?P<constant>IRCLogParser/.+\n)', r'\g<begin>\g<constant>', output)

        self.assertEqual(log_data, self.log_data)
        self.assertEqual(expected_capturedOutput, output)
Example #11
Source File: test_reader.py    From IRCLogParser with GNU General Public License v3.0 6 votes vote down vote up
def test_linux_input_all_channels(self):
        """Regression test for ``reader.linux_input`` with the "ALL" channel
        wildcard, over a fixed two-day window.

        Compares both the parsed log data and the console output produced
        while parsing against fixtures previously recorded on disk.
        """
        expected_capturedOutput = util.load_from_disk(self.current_directory + "/data/stdout_captured_linux_input_all_channels")
        expected_log_data = util.load_from_disk(self.current_directory + "/data/log_data_for_test_linux_input_all_channels")

        # Redirect stdout so the reader's progress messages can be captured.
        capturedOutput = StringIO.StringIO()
        sys.stdout = capturedOutput
        log_data = reader.linux_input(self.current_directory + "/data/log_to_test_for_all_channels/", ["ALL"], "2013-1-1", "2013-1-2")
        output = capturedOutput.getvalue()
        capturedOutput.close()
        sys.stdout = sys.__stdout__

        # Strip the machine-specific absolute path prefix so the comparison
        # does not depend on where the repository is checked out.
        #See https://docs.python.org/2/library/re.html for more details.
        output = re.sub(r'(?P<begin>.+ )/.+/(?P<constant>IRCLogParser/.+\n)', r'\g<begin>\g<constant>', output)

        self.assertEqual(expected_log_data, log_data)
        self.assertEqual(expected_capturedOutput, output)
Example #12
Source File: cgnsutils.py    From pyCGNS with GNU Lesser General Public License v2.1 5 votes vote down vote up
def getPathsByTypeFilter(tree, filter):
    """
    Returns the list of paths of ``tree`` whose **node types** match the
    filter, a `regular expression <http://docs.python.org/library/re.html>`_
    applied token by token::

      # gets GridConnectivity_t and GridConnectivity1to1_t
      allconnectivities=getPathsByTypeFilter(T,'/.*/.*/.*/GridConnectivity.*')

    :arg node tree: the target tree to parse
    :arg str filter: a regular expression for the complete path to match to

    :Return:
      - A list of paths (str) matching the types-path pattern
      - Returns empty list if no match

    :Remarks:
      - The '/' is the separator for the path tokens, so you cannot use it
       in the regular expression for any other purpose
      - Always skips `CGNSTree_t`
    """
    # One compiled pattern per path token of the filter.
    patterns = [re.compile(token) for token in filter.split('/')[1:]]
    matching_paths = []
    for path in getAllPaths(tree):
        type_tokens = getPathAsTypes(tree, path)[1:]
        # Only paths with exactly as many tokens as the filter can match.
        if len(type_tokens) != len(patterns):
            continue
        # Every string token must match its pattern; non-string tokens are
        # accepted unconditionally (same behaviour as the original for/else).
        if all(not isinstance(tok, str) or rex.match(tok) is not None
               for rex, tok in zip(patterns, type_tokens)):
            matching_paths.append(path)
    return matching_paths


# -------------------------------------------------- 
Example #13
Source File: x3270.py    From Robot-Framework-Mainframe-3270-Library with MIT License 5 votes vote down vote up
def set_screenshot_folder(self, path):
        """Choose the folder where `Take Screenshot` stores its html files.

           The folder must already exist (relative to the output folder);
           otherwise an error is logged and the previous folder is kept.

           Example:
               | Set Screenshot Folder | C:\\\Temp\\\Images |
        """
        candidate = os.path.normpath(os.path.join(self.output_folder, path))
        if not os.path.exists(candidate):
            logger.error('Given screenshots path "%s" does not exist' % path)
            logger.warn('Screenshots will be saved in "%s"' % self.imgfolder)
        else:
            self.imgfolder = path
Example #14
Source File: x3270.py    From Robot-Framework-Mainframe-3270-Library with MIT License 5 votes vote down vote up
def page_should_match_regex(self, regex_pattern):
        """Fail unless the current screen text matches ``regex_pattern``.

        The check is implemented with the Python
        [https://docs.python.org/2/library/re.html|re module]; Python's
        regex syntax is derived from Perl and thus close to the syntax used
        in, for example, Java, Ruby and .NET.

        Backslash is an escape character in the test data, so backslashes in
        the pattern must themselves be escaped (e.g. \\\d\\\w+).
        """
        screen_text = self._read_all_screen()
        matches = re.findall(regex_pattern, screen_text, re.MULTILINE)
        if not matches:
            raise Exception('No matches found for "' + regex_pattern + '" pattern')
Example #15
Source File: x3270.py    From Robot-Framework-Mainframe-3270-Library with MIT License 5 votes vote down vote up
def page_should_not_match_regex(self, regex_pattern):
        """Fail if the current screen text matches ``regex_pattern``.

        The check is implemented with the Python
        [https://docs.python.org/2/library/re.html|re module]; Python's
        regex syntax is derived from Perl and thus close to the syntax used
        in, for example, Java, Ruby and .NET.

        Backslash is an escape character in the test data, so backslashes in
        the pattern must themselves be escaped (e.g. \\\d\\\w+).
        """
        found = re.findall(regex_pattern, self._read_all_screen(), re.MULTILINE)
        if found:
            raise Exception('There are matches found for "' + regex_pattern + '" pattern')
Example #16
Source File: plugin.py    From mkdocstrings with ISC License 5 votes vote down vote up
def on_page_content(self, html: str, page: Page, config: Config, files: Files, **kwargs) -> str:
        """
        Hook for the [`on_page_contents` event](https://www.mkdocs.org/user-guide/plugins/#on_page_contents).

        Walks the page's table of contents and records, for every anchor ID
        found there, the anchor's absolute URL. That mapping is used later to
        fix unresolved references of the form `[title][identifier]` or
        `[identifier][]`.
        """
        log.debug(f"mkdocstrings.plugin: Mapping identifiers to URLs for page {page.file.src_path}")
        for toc_item in page.toc.items:
            self.map_urls(page.canonical_url, toc_item)
        return html
Example #17
Source File: plugin.py    From mkdocstrings with ISC License 5 votes vote down vote up
def on_post_page(self, output: str, page: Page, config: Config, **kwargs) -> str:
        """
        Hook for the [`on_post_page` event](https://www.mkdocs.org/user-guide/plugins/#on_post_page).

        In this hook, we try to fix unresolved references of the form `[title][identifier]` or `[identifier][]`.
        Doing that allows the user of `mkdocstrings` to cross-reference objects in their documentation strings.
        It uses the native Markdown syntax so it's easy to remember and use.

        We log a warning for each reference that we couldn't map to a URL, but try to be smart and ignore identifiers
        that do not look legitimate (sometimes documentation can contain strings matching
        our [`AUTO_REF`][mkdocstrings.plugin.AUTO_REF] regular expression that did not intend to reference anything).
        We currently ignore references when their identifier contains a space or a slash.
        """
        log.debug(f"mkdocstrings.plugin: Fixing references in page {page.file.src_path}")

        # Pick a placeholder seed that occurs neither in the page output nor
        # in any mapped URL, so the substitution cannot clobber real content.
        placeholder = Placeholder()
        while re.search(placeholder.seed, output) or any(placeholder.seed in url for url in self.url_map.values()):
            placeholder.set_seed()

        unmapped, unintended = [], []  # type: ignore
        # Shield code tags from the reference-fixing regex, run the fix over
        # the rest of the HTML, then restore the shielded tags at the end.
        soup = BeautifulSoup(output, "html.parser")
        placeholder.replace_code_tags(soup)
        fixed_soup = AUTO_REF.sub(self.fix_ref(unmapped, unintended), str(soup))

        if unmapped or unintended:
            # We do nothing with unintended refs
            if unmapped and log.isEnabledFor(logging.WARNING):
                for ref in unmapped:
                    log.warning(
                        f"mkdocstrings.plugin: {page.file.src_path}: Could not find cross-reference target '[{ref}]'"
                    )

        return placeholder.restore_code_tags(fixed_soup)
Example #18
Source File: cgnsutils.py    From pyCGNS with GNU Lesser General Public License v2.1 5 votes vote down vote up
def getPathsByTokenFilter(tree, filter):
    """
    Returns a list of paths from T matching the filter. The filter is a
    `regular expression <http://docs.python.org/library/re.html>`_
    used to match at least one of the tokens of the path::

     import CGNS.PAT.cgnskeywords as CK

     for path in getPathsByTokenFilter(T,'Family.*'):
         print 'Family ',path,' is ',path[2]

    :arg CGNS/Python tree: target tree to parse
    :arg str filter: a regular expression for the token to match to
    :return:
      - A list of paths (strings) matching the path pattern, each path
        listed at most once
      - Returns empty list if no match
    :Remarks:
      - You cannot use the regex to match for a path
      - Always skips `CGNSTree_t`
    """
    lpth = getAllPaths(tree)
    reg = re.compile(filter)
    rpth = []
    for p in lpth:
        pl = getPathToList(p, True)
        for tk in pl:
            if reg.match(tk) is not None:
                rpth.append(p)
                # Bug fix: stop at the first matching token, otherwise a path
                # with several matching tokens was appended multiple times
                # (the sibling getPathsByTypeFilter reports each path once).
                break
    return rpth


# -------------------------------------------------- 
Example #19
Source File: x3270.py    From Robot-Framework-Mainframe-3270-Library with MIT License 5 votes vote down vote up
def execute_command(self, cmd):
        """Run an [http://x3270.bgp.nu/wc3270-man.html#Actions|x3270 command] on the emulator.

           Examples:
               | Execute Command | Enter |
               | Execute Command | Home |
               | Execute Command | Tab |
               | Execute Command | PF(1) |
        """
        encoded_cmd = str(cmd).encode("utf-8")
        self.mf.exec_command(encoded_cmd)
        # Give the emulator time to process the action before returning.
        time.sleep(self.wait)
Example #20
Source File: wdl_parser.py    From toil with Apache License 2.0 5 votes vote down vote up
def __init__(self, string, resource, errors, user_context):
        # Shorthand that sets string/resource/errors/user_context as
        # attributes. NOTE(review): locals() also contains `self`, so this
        # creates a circular `self.self` reference -- typical of generated
        # parser code; harmless but worth knowing.
        self.__dict__.update(locals())
        self.stack = ['default']  # lexer mode stack; 'default' is the start mode
        self.line = 1  # 1-based current line in the input
        self.col = 1  # 1-based current column in the input
        self.tokens = []  # tokens produced so far
        self.user_context = user_context
        self.re_match = None # https://docs.python.org/3/library/re.html#match-objects
Example #21
Source File: sanity.py    From reframe with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def findall(patt, filename, encoding='utf-8'):
    '''Get all matches of regex ``patt`` in ``filename``.

    :arg patt: The regex pattern to search.
        Any standard Python `regular expression
        <https://docs.python.org/3/library/re.html#regular-expression-syntax>`_
        is accepted.
    :arg filename: The name of the file to examine.
    :arg encoding: The name of the encoding used to decode the file.
    :returns: A list of raw `regex match objects
        <https://docs.python.org/3/library/re.html#match-objects>`_.
    :raises reframe.core.exceptions.SanityError: In case an :class:`OSError` is
        raised while processing ``filename``.
    '''
    # A list comprehension is the idiomatic form of list(<genexpr>).
    return [evaluate(x) for x in finditer(patt, filename, encoding)]
Example #22
Source File: filters.py    From AsyncLine with MIT License 5 votes vote down vote up
def regex(pattern, flags: int = 0):
		"""Filter messages that match a given RegEx pattern.

			Args:
				pattern (``str``):
					The RegEx pattern as string, it will be applied to the text of a message. When a pattern matches,
					all the `Match Objects <https://docs.python.org/3/library/re.html#match-objects>`
					are stored on ``m.matches``.

				flags (``int``, *optional*):
					RegEx flags.
		"""
		def f(_, m):
			# list() is clearer and faster than a pass-through comprehension.
			m.matches = list(_.p.finditer(m.text or ""))
			return bool(m.matches)
		return create("Regex", f, p=re.compile(pattern, flags))
Example #23
Source File: __init__.py    From dlisio with GNU Lesser General Public License v3.0 5 votes vote down vote up
def unknowns(self):
        """Return all objects that are unknown to dlisio.

        Unknown objects are object-types that dlisio does not know about. By
        default, that is any metadata object type not defined by rp66v1 [1].
        They are all parsed as :py:class:`dlisio.plumbing.Unknown`, which
        implements a dict interface.

        [1] http://w3.energistics.org/rp66/v1/Toc/main.html

        Notes
        -----
        Adding a custom python class for an object-type to dlis.types will
        in-effect remove all objects of that type from unknowns.

        Returns
        -------
        objects : defaultdict(list)
            A defaultdict indexed by object-type

        """
        # Records that are not of a known type, not encrypted, and not
        # already indexed -- i.e. the ones still to be parsed as Unknown.
        recs = [rec for rec, t in zip(self.attic, self.record_types)
            if  t not in self.types
            and not rec.encrypted
            and t not in self.indexedobjects
        ]

        # Parse the selected records into self.indexedobjects.
        self.load(recs, reload=False)

        unknowns = defaultdict(list)

        # Keep only the types dlisio has no dedicated class for.
        for key, value in self.indexedobjects.items():
            if key in self.types: continue
            unknowns[key] = value

        return unknowns
Example #24
Source File: core.py    From pyparsing with MIT License 5 votes vote down vote up
def sub(self, repl):
        r"""
        Return :class:`Regex` with an attached parse action to transform the parsed
        result as if called using `re.sub(expr, repl, string) <https://docs.python.org/3/library/re.html#re.sub>`_.

        Example::

            make_html = Regex(r"(\w+):(.*?):").sub(r"<\1>\2</\1>")
            print(make_html.transformString("h1:main title:"))
            # prints "<h1>main title</h1>"
        """
        # sub() cannot work when matches are returned as group lists.
        if self.asGroupList:
            warnings.warn(
                "cannot use sub() with Regex(asGroupList=True)",
                SyntaxWarning,
                stacklevel=2,
            )
            raise SyntaxError()

        # A callable repl needs a string token, which asMatch does not give.
        if self.asMatch and callable(repl):
            warnings.warn(
                "cannot use sub() with a callable with Regex(asMatch=True)",
                SyntaxWarning,
                stacklevel=2,
            )
            raise SyntaxError()

        if self.asMatch:
            # Tokens are match objects: delegate to Match.expand.
            def pa(tokens):
                return tokens[0].expand(repl)

        else:
            # Tokens are strings: re-apply the compiled regex via re.sub.
            def pa(tokens):
                return self.re.sub(repl, tokens[0])

        return self.addParseAction(pa)
Example #25
Source File: core.py    From cmt with MIT License 5 votes vote down vote up
def sub(self, repl):
        r"""
        Return Regex with a parse action attached that rewrites each parsed
        result as `re.sub(expr, repl, string)
        <https://docs.python.org/3/library/re.html#re.sub>`_ would.

        Example::

            make_html = Regex(r"(\w+):(.*?):").sub(r"<\1>\2</\1>")
            print(make_html.transformString("h1:main title:"))
            # prints "<h1>main title</h1>"
        """
        # sub() is incompatible with asGroupList, and with a callable repl
        # when asMatch is set: warn, then refuse.
        if self.asGroupList:
            warnings.warn(
                "cannot use sub() with Regex(asGroupList=True)",
                SyntaxWarning,
                stacklevel=2,
            )
            raise SyntaxError()
        if self.asMatch and callable(repl):
            warnings.warn(
                "cannot use sub() with a callable with Regex(asMatch=True)",
                SyntaxWarning,
                stacklevel=2,
            )
            raise SyntaxError()

        if self.asMatch:
            # Tokens are match objects: delegate to Match.expand.
            def substitute(tokens):
                return tokens[0].expand(repl)
        else:
            # Tokens are strings: re-apply the compiled regex.
            def substitute(tokens):
                return self.re.sub(repl, tokens[0])

        return self.addParseAction(substitute)
Example #26
Source File: utils.py    From dag-factory with MIT License 5 votes vote down vote up
def get_time_delta(time_string: str) -> timedelta:
    """
    Takes a time string (1 hour, 10 days, "2 hours 30 minutes", etc.) and
    returns a python timedelta object

    :param time_string: the time value to convert to a timedelta
    :type time_string: str
    :returns: datetime.timedelta for relative time
    :type datetime.timedelta
    :raises Exception: if no recognisable "<number> <unit>" pair is found
    """
    # One match per "<number> <unit>" pair. The optional trailing "s" lets
    # both "1 hour" and "2 hours" parse, and scanning with finditer (instead
    # of a single anchored match) fixes the original behaviour in which every
    # unit after the first one was silently dropped.
    rel_time: Pattern = re.compile(
        pattern=r"(?P<magnitude>\d+)\s+(?P<unit>hour|minute|second|day)s?",
        flags=re.IGNORECASE,
    )
    time_params: Dict[str, int] = {}
    for part in rel_time.finditer(time_string):
        # timedelta keyword arguments are the plural unit names.
        unit = part.group("unit").lower() + "s"
        time_params[unit] = time_params.get(unit, 0) + int(part.group("magnitude"))
    if not time_params:
        raise Exception(f"Invalid relative time: {time_string}")
    return timedelta(**time_params)
Example #27
Source File: dumpgenerator.py    From wikiteam with GNU General Public License v3.0 5 votes vote down vote up
def saveSpecialVersion(config={}, session=None):
    """ Save Special:Version as .html, to preserve extensions details """
    # NOTE(review): Python 2 code (print statements). The mutable default
    # argument ``config={}`` is only read here, so the usual shared-state
    # pitfall does not apply, but confirm before reusing the pattern.
    if os.path.exists('%s/Special:Version.html' % (config['path'])):
        print 'Special:Version.html exists, do not overwrite'
    else:
        print 'Downloading Special:Version with extensions and other related info'
        r = session.post(
            url=config['index'], params={'title': 'Special:Version'}, timeout=10)
        raw = r.text
        # Rate-limit between requests to be polite to the wiki server.
        delay(config=config, session=session)
        # Scrub IP addresses before persisting the page to disk.
        raw = removeIP(raw=raw)
        with open('%s/Special:Version.html' % (config['path']), 'w') as outfile:
            outfile.write(raw.encode('utf-8'))
Example #28
Source File: utils.py    From sncli with MIT License 5 votes vote down vote up
def build_regex_search(search_string):
    """
    Build up a compiled regular expression from the search string.

    Supports the use of flags - ie. search for `nothing/i` will perform a
    case-insensitive regex for `nothing`.

    :param search_string: the raw search text, optionally ending in
        ``/<flags>`` where each known flag letter maps to a ``re`` flag
    :returns: the compiled pattern, or ``None`` if the search string is
        empty or is not a valid regular expression
    """

    sspat = None
    valid_flags = {
            'i': re.IGNORECASE
    }
    if search_string:
        try:
            # re.DOTALL lets the pattern group match search strings that
            # contain newlines; without it re.match() returned None for such
            # input and the .groups() call crashed with an AttributeError
            # (which `except re.error` did not catch).
            search_string, flag_letters = re.match(
                r'^(.+?)(?:/([a-z]+))?$', search_string, re.DOTALL).groups()
            flags = 0
            # if flags are given, OR together all the valid flags
            # see https://docs.python.org/3/library/re.html#re.compile
            if flag_letters:
                for letter in flag_letters:
                    if letter in valid_flags:
                        flags = flags | valid_flags[letter]
            sspat = re.compile(search_string, flags)
        except re.error:
            sspat = None

    return sspat
Example #29
Source File: google_doc.py    From kobo-predict with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def to_html(self):
        """Render this section through the ``section.html`` template."""
        template_name = 'section.html'
        return render_to_string(template_name, self)
Example #30
Source File: google_doc.py    From kobo-predict with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def _extract_sections(self):
        """
        Here is an example of what a section header looks like in the
        html of a Google Document:

        <h3 class="c1"><a name="h.699ffpepx6zs"></a><span>Hello World
        </span></h3>

        We split the content of the Google Document up using a regular
        expression that matches the above header. re.split is a pretty
        cool function if you haven't tried it before. It puts the
        matching groups into the list as well as the content between
        the matches. Check it out here:

        http://docs.python.org/library/re.html#re.split

        One big thing we do in this method is replace the ugly section
        id that Google creates with a nicely slugified version of the
        section title. This makes for pretty urls.
        """
        self._sections = []
        # Captures three groups per header: level digit, anchor id, title.
        header = r'<h(?P<level>\d) class="[^"]+">' \
            r'<a name="(?P<id>[^"]+)"></a>'      \
            r'<span>(?P<title>[^<]+)</span>'     \
            r'</h\d>'
        # re.split yields [prefix, level, id, title, content, level, ...];
        # drop the prefix, then consume the rest in groups of four.
        l = re.split(header, self._content)
        l.pop(0)
        while l:
            # NOTE(review): .decode('utf8') implies this is Python 2 code
            # (byte-string content) -- confirm before porting.
            section = Section(
                # hack: cause we started with h3 in google docs
                level=int(l.pop(0)) - 2,
                id=l.pop(0),
                title=l.pop(0).decode('utf8'),
                content=l.pop(0),
                )
            # Replace Google's opaque anchor id with a slug of the title.
            section['id'] = slugify(section['title'])
            if section['level'] >= 1:
                self._sections.append(section)