# timApp.documentmodel.documentparser

import re

from typing import Optional

from documentmodel.attributeparser import AttributeParser
from documentmodel.documentparseroptions import DocumentParserOptions
from documentmodel.randutils import hashfunc, random_id, is_valid_id
from utils import count_chars


class SplitterException(Exception):
    pass


class ValidationException(Exception):
    pass


class ValidationWarning(ValidationException):
    pass

class DocReader:
    """A simple cursor over a list of lines.

    :type lines: list[str]
    :type current_line: int
    """

    def __init__(self, lines, i=0):
        """
        :param lines: The lines to read.
        :param i: The index of the line to start from.
        """
        self.lines = lines
        self.current_line = i

    def peek_line(self):
        """
        :rtype: str
        :return: The current line without advancing the position.
        """
        return self.lines[self.current_line]

    def get_line_and_advance(self):
        """Returns the current line and advances the position by one."""
        result = self.peek_line()
        self.current_line += 1
        return result

    def has_more_lines(self):
        """Returns whether there are any lines left to read."""
        return self.current_line < len(self.lines)
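

# Illustrative usage sketch (added for documentation, not part of the original
# module): DocReader is consumed by peeking at the current line and advancing
# explicitly, e.g.
#
#     reader = DocReader(['first line', 'second line'])
#     while reader.has_more_lines():
#         line = reader.get_line_and_advance()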


class DocumentParser:
    """Splits documents into paragraphs.

    :type _blocks: list[dict]
    :type _doc_text: str
    :type _last_setting: DocumentParserOptions
    """

    def __init__(self, doc_text=''):
        """
        :type doc_text: str
        """
        self._doc_text = doc_text
        self._blocks = None
        self._break_on_empty_line = False
        self._last_setting = None  # type: DocumentParserOptions

    def get_blocks(self, options: Optional[DocumentParserOptions] = None):
        if options is None:
            options = DocumentParserOptions()
        self._parse_document(options)
        return self._blocks

    def add_missing_attributes(self, hash_func=hashfunc, id_func=random_id):
        self._parse_document(self._last_setting)
        for r in self._blocks:
            r['t'] = hash_func(r['md'], r['attrs'])
            if not r.get('id'):
                r['id'] = id_func()
        return self

    def validate_structure(self, id_validator_func=is_valid_id, is_whole_document=True):
        self._parse_document(self._last_setting)
        found_ids = set()
        found_tasks = set()
        found_areas = set()
        classed_areas = []
        found_area_ends = set()
        for r in self._blocks:
            if r['type'] == 'code':
                # The closing fence of a code block must not carry attributes.
                md = r['md']
                try:
                    last_line = md[md.rindex('\n') + 1:]
                    num_ticks = count_chars(md, '`')
                    if last_line.startswith('`' * num_ticks):
                        attrs, start_index = AttributeParser(last_line).get_attributes()
                        if start_index is not None:
                            raise ValidationException('The end of code block contains attributes: {}'.format(attrs))
                except ValueError:
                    pass
            curr_id = r.get('id')
            if curr_id is not None:
                if curr_id in found_ids:
                    raise ValidationException('Duplicate paragraph id: ' + curr_id)
                found_ids.add(curr_id)
                if not id_validator_func(curr_id):
                    raise ValidationException('Invalid paragraph id: ' + curr_id)
            attrs = r.get('attrs', {})
            task_id = attrs.get('taskId')
            if task_id:
                if task_id in found_tasks:
                    # Duplicate task ids are not fatal, but still something we could warn the user about.
                    # For now, just ignore them.
                    # print('Duplicate task id: ' + task_id)
                    # raise ValidationException('Duplicate task id: ' + task_id)
                    pass
                # found_tasks.add(task_id)
            area = attrs.get('area')
            if area:
                if area in found_areas:
                    raise ValidationException('Cannot have multiple areas with same name: ' + area)
                has_classes = len(attrs.get('classes', [])) > 0
                if has_classes:
                    classed_areas.append(area)
                found_areas.add(area)
            area_end = attrs.get('area_end')
            if area_end:
                if area_end == area:
                    raise ValidationException('Cannot have a zero-length area')
                if area_end in classed_areas:
                    if area_end != classed_areas[-1]:
                        raise ValidationWarning('Classed areas cannot overlap ("{}" and "{}")'
                                                .format(classed_areas[-1], area_end))
                    classed_areas.pop()
                if is_whole_document:
                    if area_end not in found_areas:
                        raise ValidationWarning('No start found for area "{}"'.format(area_end))
                    if area_end in found_area_ends:
                        raise ValidationWarning('Area already ended: ' + area_end)
                found_area_ends.add(area_end)
        unended_areas = found_areas - found_area_ends
        if is_whole_document and unended_areas:
            raise ValidationWarning('Some areas were not ended: ' + str(unended_areas))
        return self

    def _parse_document(self, options: Optional[DocumentParserOptions]):
        if options is None:
            options = DocumentParserOptions()
        if self._last_setting == options:
            return
        self._blocks = []
        self._break_on_empty_line = options.break_on_empty_line
        self._last_setting = options
        lines = self._doc_text.split("\n")
        doc = DocReader(lines)
        funcs = [self.try_parse_code_block,
                 self.try_parse_header_block,
                 self.parse_normal_block]
        while True:
            self.eat_whitespace(doc)
            if not doc.has_more_lines():
                break
            for func in funcs:
                result = func(doc)
                if result:
                    result['md'] = result['md'].rstrip().strip('\r\n')
                    # Merge this block into the previous one when the options do not force a
                    # break for this block type, the block has no attributes, and the previous
                    # block exists and is neither a plugin block nor an atom block.
                    if ((result['type'] == 'code' and not options.break_on_code_block)
                        or (result['type'] == 'header' and not options.break_on_header)
                        or (result['type'] == 'autonormal' and not options.break_on_normal)) \
                            and not result.get('attrs') and len(self._blocks) > 0 \
                            and not self._blocks[-1].get('attrs', {}).get('plugin') \
                            and self._blocks[-1]['type'] != 'atom':
                        self._blocks[-1]['md'] += '\n\n' + result['md']
                    else:
                        if not result.get('attrs'):
                            result['attrs'] = {}
                        self._blocks.append(result)
                    break

    def is_beginning_of_code_block(self, doc):
        """
        :type doc: DocReader
        """
        if doc.peek_line().startswith('```'):
            code_start_char = '`'
        elif doc.peek_line().startswith('~~~'):
            code_start_char = '~'
        else:
            return False, None
        match = re.match('^' + code_start_char + '+', doc.peek_line()).group(0)
        return True, match

    def is_beginning_of_header_block(self, doc):
        return doc.peek_line().startswith('#')

    def is_empty_line(self, doc):
        """
        :type doc: DocReader
        """
        return doc.peek_line().isspace() or doc.peek_line() == ''

    def try_parse_code_block(self, doc):
        """
        :type doc: DocReader
        :rtype: dict
        """
        is_code_block, code_block_marker = self.is_beginning_of_code_block(doc)
        if not is_code_block:
            return None
        start_line = doc.get_line_and_advance()
        block_lines = []
        tokens, start = AttributeParser(start_line).get_attributes()
        # An 'atom' block keeps only the fenced content; neither fence line is included in md.
        is_atom = tokens.get('atom', False)
        if is_atom:
            tokens.pop('atom')
        else:
            first_line = start_line[:start].strip()
            block_lines.append(first_line)
        line = None
        while True:
            if not doc.has_more_lines():
                break
            line = doc.get_line_and_advance()
            if line.startswith(code_block_marker):
                break
            block_lines.append(line)
        if not is_atom and line is not None and line.startswith(code_block_marker):
            block_lines.append(line)
        result = {'md': '\n'.join(block_lines), 'type': 'atom' if is_atom else 'code'}
        self.extract_attrs(result, tokens)
        return result

    def try_parse_header_block(self, doc):
        """
        :type doc: DocReader
        :rtype: dict
        """
        if not self.is_beginning_of_header_block(doc):
            return None
        header_line = doc.get_line_and_advance()
        block_lines = []
        tokens, start = AttributeParser(header_line).get_attributes()
        block_type = 'normal'
        if not header_line.startswith('#-'):
            block_type = 'header'
            block_lines.append(header_line[:start].strip())
        block_lines.append(self.parse_normal_block(doc)['md'])
        result = {'md': '\n'.join(block_lines), 'type': block_type}
        self.extract_attrs(result, tokens)
        return result

    def parse_normal_block(self, doc):
        """
        :type doc: DocReader
        """
        block_lines = []
        while doc.has_more_lines():
            if self.is_beginning_of_header_block(doc) \
                    or self.is_beginning_of_code_block(doc)[0] \
                    or (self._break_on_empty_line and self.is_empty_line(doc)):
                break
            block_lines.append(doc.get_line_and_advance())
        return {'md': '\n'.join(block_lines), 'type': 'autonormal'}

    def extract_attrs(self, result, tokens):
        for builtin in ('id', 't'):
            if builtin in tokens:
                result[builtin] = tokens.pop(builtin)
        if len(tokens) > 0:
            result['attrs'] = tokens

    def eat_whitespace(self, doc):
        """
        :rtype: NoneType
        :type doc: DocReader
        """
        while doc.has_more_lines() and self.is_empty_line(doc):
            doc.get_line_and_advance()
        return None
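

# Illustrative usage sketch (an assumption added for documentation, not part of the
# original module): parse a small made-up markdown document into blocks, fill in the
# missing ids and hashes, and validate the result. The example text is hypothetical;
# the exact block boundaries depend on DocumentParserOptions defaults.
if __name__ == '__main__':
    example_text = '# Heading\n\nFirst paragraph.\n\n#-\nSecond paragraph.\n'
    options = DocumentParserOptions()
    parser = DocumentParser(example_text)
    blocks = parser.get_blocks(options)
    # add_missing_attributes() assigns a hash ('t') and a random id to each block;
    # validate_structure() raises ValidationException/ValidationWarning on problems.
    parser.add_missing_attributes().validate_structure()
    for block in blocks:
        print(block['type'], block.get('id'), repr(block['md']))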