1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 '''
32 The module provides the methods for inspecting odt files.
33
34 @author: Vili Auvinen, Juho Tammela, Olli Kauppinen
35 '''
36
37 import conversions
38 import string
39 import ooo_meta_inspector
40 import common_methods
41
def _getStyleElementByStyleName(documentDict, styleName):
    '''Look up a style element by its internal style name.

    The style:style elements of content.xml are checked first. When none of
    them matches, the style:style elements and then the text:list-style
    elements of styles.xml are checked.

    @param documentDict: mapping that holds the parsed 'content.xml' and 'styles.xml' DOM documents.
    @param styleName: value of the style:name attribute to look for.

    @return: The first matching element, or None when no style has that name.
    '''
    # (file, tag) pairs in lookup priority order; files are only read from
    # the mapping when the search actually reaches them.
    searchOrder = (
        ('content.xml', 'style:style'),
        ('styles.xml', 'style:style'),
        ('styles.xml', 'text:list-style'),
    )
    for fileName, tagName in searchOrder:
        for candidate in documentDict[fileName].getElementsByTagName(tagName):
            if candidate.getAttribute('style:name') == styleName:
                return candidate
    return None
59
60
def _getStyleElementByDisplayName(documentDict, styleName):
    '''Look up a style element by its display name, ignoring case.

    The style:style elements of content.xml are checked first, then those of
    styles.xml.

    @note: XML example:

    <style:style style:name="Text_20_body" style:display-name="Text body">

    <style:style style:name="tutkielma">

    In the encoded style name a space becomes "_20_" and an underscore
    becomes "_5f_"; the display name keeps the readable form.

    @param documentDict: mapping that holds the parsed 'content.xml' and 'styles.xml' DOM documents.
    @param styleName: display name to search for (compared case-insensitively).

    @return: The first element whose style:display-name matches, otherwise None.
    '''
    wanted = styleName.lower()
    for fileName in ('content.xml', 'styles.xml'):
        for candidate in documentDict[fileName].getElementsByTagName('style:style'):
            if candidate.getAttribute('style:display-name').lower() == wanted:
                return candidate
    return None
85
87 '''Gets the style display name by the given style name.
88 It uses _getStyleElementByStyleName method to find the style.
89 If the style doesn't have display-name attribute then the display name is just the style name.
90
91 @note: XML example:
92
93 <style:style style:name="Text_20_body" style:display-name="Text body">
94
95 <style:style style:name="tutkielma">
96
97 Display name --> style name
98 " "-->"_20_"
99 "_"-->"_5f_"
100
101 @return: The style display name.
102 '''
103
104 element = _getStyleElementByStyleName(documentDict, styleName)
105 if element.hasAttribute('style:display-name'):
106 return element.getAttribute('style:display-name')
107 else:
108 return styleName
109
def _getMasterPageStyleElement(documentDict, masterPageStyleName):
    '''Find the style:master-page element with the given name in styles.xml.

    @param documentDict: mapping that holds the parsed 'styles.xml' DOM document.
    @param masterPageStyleName: value of the style:name attribute to match.

    @return: The first matching master page element, or None if there is none.
    '''
    masterPages = documentDict['styles.xml'].getElementsByTagName('style:master-page')
    matches = (
        candidate for candidate in masterPages
        if candidate.getAttribute('style:name') == masterPageStyleName
    )
    return next(matches, None)
120
def _getPageLayoutElement(documentDict, pageLayoutName):
    '''Find the style:page-layout element with the given name in styles.xml.

    @param documentDict: mapping that holds the parsed 'styles.xml' DOM document.
    @param pageLayoutName: value of the style:name attribute to match.

    @return: The first matching page layout element, or None if there is none.
    '''
    layouts = documentDict['styles.xml'].getElementsByTagName('style:page-layout')
    matches = (
        candidate for candidate in layouts
        if candidate.getAttribute('style:name') == pageLayoutName
    )
    return next(matches, None)
131
132
def getPageMarginals(documentDict):
    '''Collect the page margins shared by all used master pages.

    For every used master page the fo:margin-top/bottom/right/left attributes
    are read from the first style:page-layout-properties element of its page
    layout.

    @param documentDict: mapping that holds the parsed document DOM trees.

    @return: The margins passed through conversions.convertCmOrInDictToString,
    or False as soon as two used pages disagree on any margin.

    @see: convertCmOrInDictToString
    '''
    sides = ('top', 'bottom', 'right', 'left')
    margins = dict.fromkeys(sides)

    for masterPage in _getUsedMasterPageElements(documentDict):
        layoutName = masterPage.getAttribute('style:page-layout-name')
        layout = _getPageLayoutElement(documentDict, layoutName)
        properties = layout.getElementsByTagName('style:page-layout-properties')[0]
        for side in sides:
            value = properties.getAttribute('fo:margin-' + side)
            if margins[side] is None:
                # First page seen for this side: remember its value.
                margins[side] = value
            elif margins[side] != value:
                return False

    return conversions.convertCmOrInDictToString(margins)
159
def getPageSize(documentDict):
    '''Collect the page size shared by all used master pages.

    For every used master page the fo:page-height and fo:page-width
    attributes are read from the first style:page-layout-properties element
    of its page layout.

    @param documentDict: mapping that holds the parsed document DOM trees.

    @return: The size passed through conversions.convertCmOrInDictToString,
    or False as soon as two used pages disagree on height or width.

    @see: convertCmOrInDictToString
    '''
    dimensions = ('height', 'width')
    pageSize = dict.fromkeys(dimensions)

    for masterPage in _getUsedMasterPageElements(documentDict):
        layoutName = masterPage.getAttribute('style:page-layout-name')
        layout = _getPageLayoutElement(documentDict, layoutName)
        properties = layout.getElementsByTagName('style:page-layout-properties')[0]
        for dimension in dimensions:
            value = properties.getAttribute('fo:page-' + dimension)
            if pageSize[dimension] is None:
                # First page seen for this dimension: remember its value.
                pageSize[dimension] = value
            elif pageSize[dimension] != value:
                return False

    return conversions.convertCmOrInDictToString(pageSize)
184
def _getUsedMasterPageElements(documentDict):
    '''Get all master page elements which are used in the document.

    The master page names come from the styles that set a non-empty
    style:master-page-name. When none of them resolves to an element, the
    'Standard' master page is used instead.

    @param documentDict: mapping that holds the parsed document DOM trees.

    @return: The list of the used master page elements. The list never
    contains None: names that styles.xml does not define are skipped.
    '''
    usedMasterPageElements = []
    usedMasterPageDict = _getAllStyleNamesWithDifferentMasterPage(documentDict)
    for masterPageName in usedMasterPageDict.values():
        masterPageElement = _getMasterPageStyleElement(documentDict, masterPageName)
        # A style may name a master page that styles.xml does not define;
        # a None entry would crash callers that call getAttribute on it.
        if masterPageElement is not None:
            usedMasterPageElements.append(masterPageElement)

    if not usedMasterPageElements:
        standardElement = _getMasterPageStyleElement(documentDict, 'Standard')
        if standardElement is not None:
            usedMasterPageElements.append(standardElement)
    return usedMasterPageElements
200
def _getDefaultStyleElement(documentDict, styleFamily):
    '''Find the style:default-style element of a style family in styles.xml.

    @note: Every style is based on a style family, for example:

    <style:style style:name="Heading_20_1" style:display-name="Heading 1" style:family="paragraph">

    @param documentDict: mapping that holds the parsed 'styles.xml' DOM document.
    @param styleFamily: value of the style:family attribute to match, e.g.
    paragraph, graphic, table or table-row.

    @return: The first default style element of that family, or None.
    '''
    defaults = documentDict['styles.xml'].getElementsByTagName('style:default-style')
    matches = (
        candidate for candidate in defaults
        if candidate.getAttribute('style:family') == styleFamily
    )
    return next(matches, None)
221
222
def _getMasterPageStyleName(documentDict, styleElement):
    '''Read the master page name a style element refers to.

    @param documentDict: not used by this function; kept for a uniform signature.
    @param styleElement: element whose style:master-page-name attribute is read.

    @return: The attribute value, or 'Standard' when the value is the empty string.
    '''
    name = styleElement.getAttribute('style:master-page-name')
    return 'Standard' if name == '' else name
233
def checkEmptyParagraphs(documentDict):
    '''Count the empty paragraphs of the document.

    The paragraphs come from _getDocumentParagraphs. A paragraph counts as
    empty when it has no child nodes, unless it directly follows the table
    of content or uses a style that carries a page break.

    @param documentDict: mapping that holds the parsed document DOM trees.

    @return: The number of the empty paragraphs if any were found, otherwise False.
    '''
    # Built once; membership is tested for every paragraph.
    pageBreakStyles = set(_getPageBreakStyleNames(documentDict))
    emptyParagraphs = 0
    for paragraph in _getDocumentParagraphs(documentDict):
        if paragraph.childNodes:
            continue
        previous = paragraph.previousSibling
        # previousSibling is None for the first node, and text nodes have no
        # tagName; nodeName is available on every node type.
        if previous is not None and previous.nodeName == 'text:table-of-content':
            continue
        if paragraph.getAttribute('text:style-name') in pageBreakStyles:
            continue
        emptyParagraphs += 1

    if emptyParagraphs == 0:
        return False
    return emptyParagraphs
251
253 '''Checks double spaces.
254 Checks if the document has text:s tag.
255
256 @note: XML example:
257
258 <text:s text:c="2"/> --> 3 spaces
259
260 <text:s/> --> 2 spaces
261
262 @return: The amount of the double spaces.
263 '''
264 doubleSpaces = documentDict['content.xml'].getElementsByTagName('text:s')
265
266 if len(doubleSpaces) == 0:
267 return False
268
269 return len(doubleSpaces)
270
def checkTabs(documentDict):
    '''Count the text:tab elements inside the document paragraphs.

    The paragraphs to inspect come from _getDocumentParagraphs.

    @param documentDict: mapping that holds the parsed document DOM trees.

    @return: The number of the tabulators if any were found, otherwise False.
    '''
    tabCount = sum(
        len(paragraph.getElementsByTagName('text:tab'))
        for paragraph in _getDocumentParagraphs(documentDict)
    )
    return tabCount if tabCount != 0 else False
287
289 '''Checks asterisk from the document.
290 getDocumentPararaphs method gets all paragraphs to check for.
291
292 @return: The number of the asterisks if found, otherwise returns False.
293 '''
294 paragraphs = _getDocumentParagraphs(documentDict)
295 asterisk = 0
296 for element in paragraphs:
297 if common_methods.checkStringFromContent(element, '*') is True:
298 asterisk += 1
299 if asterisk == 0:
300 return False
301 return asterisk
302
def _getDocumentParagraphs(documentDict):
    '''Collect the top-level paragraphs and headings of the document.

    Only the direct children of the first office:text element of content.xml
    are considered; text:p (paragraph) and text:h (heading) nodes are kept.
    It is used in checkTabs and checkEmptyParagraphs.

    @param documentDict: mapping that holds the parsed 'content.xml' DOM document.

    @return: The list of the matching nodes in document order.
    '''
    wantedNames = ('text:p', 'text:h')
    officeText = documentDict['content.xml'].getElementsByTagName('office:text')[0]
    return [child for child in officeText.childNodes if child.nodeName in wantedNames]
318
def _getListOfUsedStyleElements(documentDict):
    '''Get the elements of the document body which apply a text style.

    The nodes to inspect are obtained from common_methods.getDescendants for
    the first office:body element of content.xml. An element is kept when it
    has a text:style-name attribute and common_methods.getTextContent
    returns something of non-zero length for it.

    @param documentDict: mapping that holds the parsed 'content.xml' DOM document.

    @return: The element list of the used styles.
    '''
    bodyElement = documentDict['content.xml'].getElementsByTagName('office:body')[0]

    usedStyleElements = []
    for node in common_methods.getDescendants(bodyElement, []):
        # Only element nodes have hasAttribute; text, comment and CDATA
        # nodes must be skipped instead of raising AttributeError.
        if node.nodeType != node.ELEMENT_NODE:
            continue
        if not node.hasAttribute('text:style-name'):
            continue
        if len(common_methods.getTextContent(node)) != 0:
            usedStyleElements.append(node)

    return usedStyleElements
336
def _getListOfUsedStyleNames(documentDict):
    '''Get the names of all the styles used in the document body.

    A style named 'P' followed by a digit (P1, P2, ...) is replaced by its
    parent style, e.g. P1 becomes Heading_20_1. Such a style is left out when
    it cannot be resolved or has no style:parent-style-name attribute. Every
    other name is reported as it is.

    @param documentDict: mapping that holds the parsed document DOM trees.

    @return: The sorted list of the distinct style names.
    '''
    usedStyleNames = set()
    for element in _getListOfUsedStyleElements(documentDict):
        styleName = element.getAttribute('text:style-name')
        if not styleName:
            # An empty attribute value names no style.
            continue
        # The length check keeps a one-character name such as 'P' from
        # raising IndexError on styleName[1].
        isParagraphStyle = len(styleName) > 1 and styleName[0] == 'P' and styleName[1].isdigit()
        if not isParagraphStyle:
            usedStyleNames.add(styleName)
            continue
        styleElement = _getStyleElementByStyleName(documentDict, styleName)
        if styleElement is not None and styleElement.hasAttribute('style:parent-style-name'):
            usedStyleNames.add(styleElement.getAttribute('style:parent-style-name'))

    return sorted(usedStyleNames)
357
def _checkPageBreakStyleElement(styleElement):
    '''Tell whether a style element forces a page break before its content.

    Only the first style:paragraph-properties descendant is inspected; the
    style qualifies when its fo:break-before attribute equals 'page'.

    @param styleElement: the style element to inspect.

    @return: The style element itself if it contains the page break, otherwise False.
    '''
    paragraphProperties = styleElement.getElementsByTagName('style:paragraph-properties')
    if not paragraphProperties:
        return False
    if paragraphProperties[0].getAttribute('fo:break-before') != 'page':
        return False
    return styleElement
369
def _getPageBreakStyleNames(documentDict):
    '''List the names of the styles that contain a page break.

    The style:style elements of content.xml are examined first, then those of
    styles.xml; each one is tested with _checkPageBreakStyleElement.

    @param documentDict: mapping that holds the parsed 'content.xml' and 'styles.xml' DOM documents.

    @return: The list of the page break style names (style:name values).
    '''
    pageBreakStyleNames = []
    for fileName in ('content.xml', 'styles.xml'):
        for style in documentDict[fileName].getElementsByTagName('style:style'):
            if _checkPageBreakStyleElement(style) is not False:
                pageBreakStyleNames.append(style.getAttribute('style:name'))
    return pageBreakStyleNames
384
def _getAllStyleNamesWithDifferentMasterPage(documentDict):
    '''Map every style that switches the master page to that master page.

    A style is included when its style:master-page-name attribute is
    non-empty. The style:style elements of content.xml are read first, then
    those of styles.xml, so a name present in both keeps the styles.xml value.

    @param documentDict: mapping that holds the parsed 'content.xml' and 'styles.xml' DOM documents.

    @return: The dictionary with a style name as the key and a master page name as the value.
    '''
    masterPageDict = {}
    for fileName in ('content.xml', 'styles.xml'):
        for style in documentDict[fileName].getElementsByTagName('style:style'):
            masterPageName = style.getAttribute('style:master-page-name')
            if masterPageName != '':
                masterPageDict[style.getAttribute('style:name')] = masterPageName
    return masterPageDict
404
406 '''Gets section break elements from the document.
407 Finds all the elements (including text, list, heading...) which chance the section.
408
409 @return: The list of the elements which changes the section.
410 '''
411 usedStyleElementsInDocument = _getListOfUsedStyleElements(documentDict)
412 masterPagesDict = _getAllStyleNamesWithDifferentMasterPage(documentDict)
413 sectionBreakElements = []
414 for element in usedStyleElementsInDocument:
415 if masterPagesDict.has_key(element.getAttribute('text:style-name')):
416 sectionBreakElements.append(element)
417 return sectionBreakElements
418
def _getTOC(documentDict):
    '''Get the entries of the table of content.

    Every text:p element below the first text:table-of-content element
    becomes one entry, except the paragraphs whose parent is text:index-title
    (the title of the table of content).

    @param documentDict: mapping that holds the parsed 'content.xml' DOM document.

    @return: The list of the entry texts; an empty list when the document has
    no table of content.
    '''
    if not checkTOC(documentDict):
        return []
    tocElement = documentDict['content.xml'].getElementsByTagName('text:table-of-content')[0]
    # The title paragraph is not an entry, so its text is never extracted.
    return [
        common_methods.getTextContent(paragraph)
        for paragraph in tocElement.getElementsByTagName('text:p')
        if paragraph.parentNode.nodeName != 'text:index-title'
    ]
435
def checkTocContent(documentDict):
    '''Compare the document headings to the entries of the table of content.

    The two lists must have the same length, and each heading must occur
    inside the entry at the same position.

    @param documentDict: mapping that holds the parsed document DOM trees.

    @return: True if all the entries match, otherwise False.
    '''
    tocEntries = _getTOC(documentDict)
    headings = _getHeadingList(documentDict)['headings']

    if len(tocEntries) != len(headings):
        return False
    return all(heading in entry for heading, entry in zip(headings, tocEntries))
454
455
456
def checkTOC(documentDict):
    '''Tell whether content.xml has a text:table-of-content element.

    @param documentDict: mapping that holds the parsed 'content.xml' DOM document.

    @return: True if there is a table of content, otherwise False.
    '''
    tocElements = documentDict['content.xml'].getElementsByTagName('text:table-of-content')
    return len(tocElements) != 0
467
def checkIndex(documentDict):
    '''Tell whether content.xml has a text:alphabetical-index element.

    @param documentDict: mapping that holds the parsed 'content.xml' DOM document.

    @return: True if the alphabetical index exists, otherwise False.
    '''
    indexElements = documentDict['content.xml'].getElementsByTagName('text:alphabetical-index')
    return len(indexElements) != 0
476
def _getIndexContentFromDocument(documentDict):
    '''Get the texts marked as alphabetical index entries in the document.

    For every text:alphabetical-index-mark-start element of content.xml the
    node value of the node that directly follows it is collected. A mark with
    no following sibling, or one followed by a node without a value (such as
    an element), contributes nothing.

    @param documentDict: mapping that holds the parsed 'content.xml' DOM document.

    @return: The content list of the alphabetical index entries.
    '''
    indexContentList = []
    indexMarks = documentDict['content.xml'].getElementsByTagName('text:alphabetical-index-mark-start')
    for mark in indexMarks:
        following = mark.nextSibling
        if following is None:
            # The mark is the last child of its parent.
            continue
        # Only nodes that carry a value are kept, so the list never holds None.
        if following.nodeValue is not None:
            indexContentList.append(following.nodeValue)
    return indexContentList
487
def _getIndexContent(documentDict):
    '''Get the entries of the alphabetical index.

    Every text:p element below the first text:alphabetical-index element of
    content.xml becomes one entry.

    @param documentDict: mapping that holds the parsed 'content.xml' DOM document.

    @return: The list of the entry texts; an empty list when checkIndex does
    not report an index.
    '''
    if checkIndex(documentDict) is not True:
        return []
    indexElement = documentDict['content.xml'].getElementsByTagName('text:alphabetical-index')[0]
    return [
        common_methods.getTextContent(paragraph)
        for paragraph in indexElement.getElementsByTagName('text:p')
    ]
501
def checkIndexContent(documentDict):
    '''Compare the marked texts of the document to the alphabetical index entries.

    Every marked text has to occur inside at least one index entry.

    @param documentDict: mapping that holds the parsed document DOM trees.

    @return: True if every marked text is found, the error code '3' if the
    index has no entries, or the error code '4' if a marked text is missing
    from the index.
    '''
    indexEntries = _getIndexContent(documentDict)
    markedTexts = _getIndexContentFromDocument(documentDict)
    if len(indexEntries) == 0:
        return '3'

    for markedText in markedTexts:
        if not any(markedText in entry for entry in indexEntries):
            return '4'

    return True
523
def _getHeadingList(documentDict):
    '''Get all the headings of the document and the highest used outline level.

    Each text:h element of content.xml becomes one entry in the heading list.
    The outline levels are compared as numbers; a heading whose
    text:outline-level attribute is missing or not a number is ignored for
    the level.

    @param documentDict: mapping that holds the parsed 'content.xml' DOM document.

    @return: The dictionary where ['headings'] is the list of the heading
    texts and ['level'] is the text:outline-level attribute value (a string)
    of the highest level in use, or the integer 0 when no level was found.
    '''
    headingList = []
    headingOutlineLevel = 0
    highestLevel = 0
    for element in documentDict['content.xml'].getElementsByTagName('text:h'):
        headingList.append(common_methods.getTextContent(element))
        rawLevel = element.getAttribute('text:outline-level')
        try:
            level = int(rawLevel)
        except ValueError:
            continue
        # Comparing the raw strings would order "10" before "2".
        if level > highestLevel:
            highestLevel = level
            # The attribute text is returned, as callers convert it themselves.
            headingOutlineLevel = rawLevel
    return {'headings': headingList, 'level': headingOutlineLevel}
539
def checkTable(documentDict):
    '''Check if the document has a table.

    @param documentDict: mapping that holds the parsed 'content.xml' DOM document.

    @return: True if content.xml has at least one table:table element,
    otherwise False.
    '''
    tables = documentDict['content.xml'].getElementsByTagName('table:table')
    # Both branches used to return True, so a missing table was never reported.
    return len(tables) != 0
550
551
553 '''Gets tables in dictionary.
554 Every table is own entry in tablesDict (key = table1,table2...)
555 Every tableDict has table's cell address as key (A1,A2...) and cell value as dictionary's value.
556
557 @return: The dictionary of the table dictionaries.
558 '''
559 tablesDict={}
560 tableNumber = 0
561 if checkTable (documentDict) is True:
562 tableElements = documentDict['content.xml'].getElementsByTagName('table:table')
563 for tableElement in tableElements:
564 tableDict = {}
565
566 tableRowElements = tableElement.getElementsByTagName('table:table-row')
567 rowIndex = 0
568 for row in tableRowElements:
569 rowIndex += 1
570 rowCellElements = row.getElementsByTagName('table:table-cell')
571 columnIndex = ord('A') - 1
572 for cell in rowCellElements:
573 columnIndex += 1
574 index = chr(columnIndex) + str(rowIndex)
575 tableDict[index] = common_methods.getTextContent(cell)
576 tableNumber +=1
577 tablesDict['table'+str(tableNumber)]=tableDict
578 return tablesDict
579 return False
580
581
582
def checkPageNumberFromFooterAndHeader(documentDict, masterPageElement, element):
    '''Find the page number format used by a footer or a header.

    The format is taken from the first text:page-number element when it has
    a style:num-format attribute. Otherwise it is read from the first
    style:page-layout-properties element of the master page's page layout.

    @param documentDict: mapping that holds the parsed document DOM trees.
    @param masterPageElement: the master page element to look for.
    @param element: a footer or a header element.

    @return: The number format, or False when the element has no
    text:page-number element.
    '''
    layoutName = masterPageElement.getAttribute('style:page-layout-name')
    pageLayoutElement = _getPageLayoutElement(documentDict, layoutName)

    pageNumberElements = element.getElementsByTagName('text:page-number')
    if not pageNumberElements:
        return False

    firstPageNumber = pageNumberElements[0]
    if firstPageNumber.hasAttribute('style:num-format'):
        return firstPageNumber.getAttribute('style:num-format')

    # No format on the page number itself: the page layout defines it.
    layoutProperties = pageLayoutElement.getElementsByTagName('style:page-layout-properties')[0]
    return layoutProperties.getAttribute('style:num-format')
604
606 '''Gets the author and the number format from the header and the footer.
607
608 @param masterPageElement: the master page element to look for.
609
610 @return: The dictionary which contains the author and the page number format.
611 '''
612
613 meta = ooo_meta_inspector.getMeta(documentDict)
614 footer = masterPageElement.getElementsByTagName ('style:footer')
615 header = masterPageElement.getElementsByTagName ('style:header')
616
617 authorAndNumberDict = {'headerPageNumber':None, 'headerAuthor': None, 'footerPageNumber': None, 'footerAuthor': None}
618
619 if footer:
620 if checkPageNumberFromFooterAndHeader(documentDict, masterPageElement, footer[0]) is not False:
621 authorAndNumberDict['footerPageNumber'] = checkPageNumberFromFooterAndHeader(documentDict, masterPageElement, footer[0])
622 if common_methods.checkStringFromContent(footer[0], meta['dc:creator']):
623 authorAndNumberDict['footerAuthor'] = meta['dc:creator']
624
625 if header:
626 if checkPageNumberFromFooterAndHeader(documentDict, masterPageElement, header[0]) is not False:
627 authorAndNumberDict['headerPageNumber'] = checkPageNumberFromFooterAndHeader(documentDict, masterPageElement, header[0])
628
629 if common_methods.checkStringFromContent(header[0], meta['dc:creator']):
630 authorAndNumberDict['headerAuthor'] = meta['dc:creator']
631
632 return authorAndNumberDict
633
634
636 '''Checks the outline style.
637 Level is highest used headings outline level. Normally Heading 1 should be 1 and Heading 2 should be 2.
638
639 @note: XML example:
640
641 <text:outline-style style:name="Outline">
642
643 <text:outline-level-style text:level="1" style:num-format="1">
644
645 <style:list-level-properties text:list-level-position-and-space-mode="label-alignment">
646
647 <style:list-level-label-alignment text:label-followed-by="listtab" text:list-tab-stop-position="0.762cm" fo:text-indent="-0.762cm" fo:margin-left="0.762cm"/>
648
649 </style:list-level-properties>
650
651 </text:outline-level-style>
652
653 <text:outline-level-style text:level="2" style:num-format="1" text:display-levels="2">
654
655 <style:list-level-properties text:list-level-position-and-space-mode="label-alignment">
656
657 <style:list-level-label-alignment text:label-followed-by="listtab" text:list-tab-stop-position="1.016cm" fo:text-indent="-1.016cm" fo:margin-left="1.016cm"/>
658
659 </style:list-level-properties>
660
661 </text:outline-level-style>
662
663 <text:outline-level-style text:level="3" style:num-format="">
664
665 <style:list-level-properties text:list-level-position-and-space-mode="label-alignment">
666
667 <style:list-level-label-alignment text:label-followed-by="listtab" text:list-tab-stop-position="1.27cm" fo:text-indent="-1.27cm" fo:margin-left="1.27cm"/>
668
669 </style:list-level-properties>
670
671 </text:outline-level-style>
672
673 ...
674
675 </text:outline-style>
676
677 @return: True if ok, False if not.
678 '''
679
680 outlineStyleElements = documentDict['styles.xml'].getElementsByTagName('text:outline-style')
681
682 if len(outlineStyleElements) == 0:
683 outlineStyleElement = _getStyleElementByStyleName(documentDict, 'Outline')
684 outlineLevels = outlineStyleElement.getElementsByTagName('text:list-level-style-number')
685 else:
686 outlineStyleElement = outlineStyleElements[0]
687 outlineLevels = outlineStyleElement.getElementsByTagName('text:outline-level-style')
688
689
690 level = _getHeadingList(documentDict)['level']
691 index = 0
692 for element in outlineLevels:
693 index = index + 1
694 if index > int(level):
695 return True
696 if int(element.getAttribute('text:level')) == index and element.getAttribute('style:num-format') != "":
697 "ok"
698 else:
699 return False
700
701
703 '''Gets the image paths.
704 Checks if the document have an image.
705 Images are located in the picture folder.
706
707 @return: The founded paths of the images in the list, otherwise returnsFalse.
708 '''
709 imagePathList = []
710 if checkImages(documentDict) is True:
711 imageElements = documentDict['content.xml'].getElementsByTagName ('draw:image')
712 for element in imageElements:
713 imagePath = element.getAttribute ('xlink:href')
714 imagePathList.append(imagePath)
715
716 if len(imagePathList)==0:
717 return False
718 return imagePathList
719
720
def checkImages(documentDict):
    '''Tell whether content.xml has a draw:image element.

    @param documentDict: mapping that holds the parsed 'content.xml' DOM document.

    @return: True if there is an image, otherwise False.
    '''
    imageElements = documentDict['content.xml'].getElementsByTagName('draw:image')
    return len(imageElements) != 0
731
def checkList(documentDict):
    '''Tell whether content.xml has a text:list element.

    @param documentDict: mapping that holds the parsed 'content.xml' DOM document.

    @return: True if there is a list, otherwise False.
    '''
    listElements = documentDict['content.xml'].getElementsByTagName('text:list')
    return len(listElements) != 0
742
743
745 '''Prints the lists of the document.
746
747 @todo: getListContent
748 '''
749 if checkList(documentDict) is True:
750 listElements = documentDict['content.xml'].getElementsByTagName ('text:list')
751 for element in listElements:
752 print 'Lista tehty', element.getAttribute('text:style-name'), '-tyylillä, lista:'
753 listContent = element.getElementsByTagName ('text:list-item')
754 for text in listContent:
755 print "-", common_methods.getTextContent(text)
756
757
759 '''Gets objects paths.
760 Searches if the document have an image.
761
762 @return: The object path list if founds an image, otherwise an error message
763 '''
764 objectPathList = []
765 objectElements = documentDict['content.xml'].getElementsByTagName ('draw:object')
766 if objectElements:
767 for element in objectElements:
768 objectPath = element.getAttribute ('xlink:href')
769 objectPathList.append(objectPath)
770 return objectPathList
771 else:
772 return False
773
def _isStyleUsed(documentDict, styleName):
    '''Tell whether the given style name is among the used style names.

    The used names come from _getListOfUsedStyleNames.

    @param documentDict: mapping that holds the parsed document DOM trees.
    @param styleName: the style name to look for.

    @return: True if the style is used, otherwise False.
    '''
    usedStyleNames = set(_getListOfUsedStyleNames(documentDict))
    return styleName in usedStyleNames
783
785 '''Get style defination attributes by given style name.
786 parentStyleList is for executing the inheritation of styles.
787
788 @note: Inheritation of the styles:
789
790 default paragraph -style-> standard-style -> style(Text body) -> P-style -> T-style
791
792 @note: XML example (styles.xml):
793
794 <style:style style:name="Standard" style:family="paragraph" style:class="text">
795
796 <style:paragraph-properties fo:orphans="2" fo:widows="2" style:writing-mode="lr-tb"/>
797
798 <style:text-properties style:use-window-font-color="true" style:font-name="Courier New" fo:font-size="10pt" fo:language="fi" fo:country="FI" style:font-name-asian="Times New Roman" style:font-size-asian="10pt" style:font-name-complex="Times New Roman" style:font-size-complex="10pt" style:language-complex="ar" style:country-complex="SA"/>
799
800 </style:style>
801
802 <style:style style:name="Text_20_body" style:display-name="Text body" style:family="paragraph" style:parent-style-name="Standard" style:class="text" style:master-page-name="">
803
804 <style:paragraph-properties fo:margin-left="1cm" fo:margin-right="0cm" fo:margin-top="0.247cm" fo:margin-bottom="0.247cm" fo:text-indent="0cm" style:auto-text-indent="false" style:page-number="auto" fo:break-before="auto" fo:break-after="auto"/>
805
806 <style:text-properties style:font-name="Tahoma"/>
807
808 </style:style
809
810 @note: XML example (content.xml):
811
812 <style:style style:name="P2" style:family="paragraph" style:parent-style-name="Text_20_body">
813
814 <style:paragraph-properties fo:text-align="start" style:justify-single-word="false"/>
815
816 </style:style>
817
818 @return: The style defination dictionary.
819
820 '''
821 originalStyleName = styleName
822 styleNameLower = styleName.lower()
823
824 if 'body' in styleNameLower:
825 styleName = 'Text_20_body'
826 if _getStyleElementByStyleName(documentDict, styleName) is None:
827 if _getStyleElementByDisplayName(documentDict, styleName) is None:
828 return False
829 else: styleName = _getStyleElementByDisplayName(documentDict, styleName).getAttribute ('style:name')
830
831 if _isStyleUsed(documentDict, styleName) is False:
832 return False
833
834 defaultStyleElement = _getDefaultStyleElement(documentDict, 'paragraph')
835
836 fontName = defaultStyleElement.getElementsByTagName('style:text-properties')[0].getAttribute('style:font-name')
837 fontSize = defaultStyleElement.getElementsByTagName('style:text-properties')[0].getAttribute('fo:font-size')
838 language = defaultStyleElement.getElementsByTagName('style:text-properties')[0].getAttribute('fo:language')
839
840
841 styleAttributeDict = {
842 'style:name':styleName,
843 'style:parent-style-name':None,
844 'fo:language':language,
845 'style:font-name':fontName,
846 'fo:font-size':fontSize,
847 'fo:text-transform':None,
848 'fo:text-indent':'0',
849 'fo:line-height':'100%',
850 'fo:margin-left':0,
851 'fo:margin-right':0,
852 'fo:margin-top':0,
853 'fo:margin-bottom':0,
854 'fo:keep-with-next':False,
855 'fo:text-align': 'start',
856 'fo:orphans':None,
857 'fo:widows': None,
858 'fo:font-style': False,
859 'fo:font-weight':False}
860
861 translateDict = {
862 'style:name': 'styleName',
863 'style:font-name':'fontName',
864 'fo:font-size':'fontSize',
865 'fo:text-transform':'transform',
866 'fo:margin-left':'indentLeft',
867 'fo:margin-right':'indentRight',
868 'fo:text-indent':'indentFirstLine',
869 'fo:line-height':'linespacing',
870 'fo:margin-top':'spacingBefore',
871 'fo:margin-bottom':'spacingAfter',
872 'fo:keep-with-next':'keepWithNext',
873 'fo:text-align': 'alignment',
874 'fo:font-style': 'italic',
875 'fo:font-weight':'bold',
876 'widowControl':'widowControl'
877 }
878 styleDict = {}
879
880 parentStyleList = _getParentStyleList (documentDict, styleName)
881
882
883 while parentStyleList:
884 styleAttributeDict['style:name'] = parentStyleList.pop()
885 styleAttributeDict = _getStyleAttributes (documentDict, styleAttributeDict)
886
887
888 styleAttributeDict['fo:margin-top'] = conversions.convertCmOrInToPt(styleAttributeDict['fo:margin-top'])
889 styleAttributeDict['fo:margin-bottom'] = conversions.convertCmOrInToPt(styleAttributeDict['fo:margin-bottom'])
890 styleAttributeDict['style:name'] = originalStyleName
891 styleAttributeDict['fo:line-height'] = conversions.convertPercentToDecimal(styleAttributeDict['fo:line-height'])
892 styleAttributeDict['fo:font-size'] = str(round(float(styleAttributeDict['fo:font-size'].split('pt')[0]), 1))
893 styleAttributeDict['fo:text-indent'] = conversions.convertCmOrInToString(styleAttributeDict['fo:text-indent'])
894 styleAttributeDict['fo:margin-left'] = conversions.convertCmOrInToString(styleAttributeDict['fo:margin-left'])
895 styleAttributeDict['fo:margin-right'] = conversions.convertCmOrInToString(styleAttributeDict['fo:margin-right'])
896 if styleAttributeDict['fo:font-weight'] == 'bold': styleAttributeDict['fo:font-weight'] = True
897 elif styleAttributeDict['fo:font-weight'] == 'normal': styleAttributeDict['fo:font-weight'] = False
898 if styleAttributeDict['fo:font-style'] == 'italic': styleAttributeDict['fo:font-style'] = True
899 elif styleAttributeDict['fo:font-style'] == 'normal': styleAttributeDict['fo:font-style'] = False
900 if styleAttributeDict['fo:keep-with-next'] == 'always': styleAttributeDict['fo:keep-with-next'] = True
901 if styleAttributeDict['fo:text-align'] == 'justify': styleAttributeDict['fo:text-align'] = 'both'
902 elif styleAttributeDict['fo:text-align'] == 'start': styleAttributeDict['fo:text-align'] = 'left'
903 elif styleAttributeDict['fo:text-align'] == 'end': styleAttributeDict['fo:text-align'] = 'right'
904
905
906 if styleAttributeDict['fo:widows'] >= 2 and styleAttributeDict['fo:orphans'] >= 2:
907 styleAttributeDict['widowControl'] = True
908 else:
909 styleAttributeDict['widowControl'] = False
910
911
912
913 for key in translateDict.keys():
914 styleDict[translateDict[key]] = styleAttributeDict[key]
915
916 return styleDict
917
def _getStyleAttributes(documentDict, styleAttributeList):
    '''Overlay the attributes of one style onto the collected style values.

    The style named by styleAttributeList['style:name'] is looked up. For
    every key of the dictionary, a value found on the style element, on its
    first style:paragraph-properties element or on its first
    style:text-properties element replaces the old value, in that order, so
    the text properties win. Keys the style does not define keep their value.

    A fo:font-size given as a percentage is applied to the value collected so
    far and stored as a plain number string without a unit. When the
    collected value or the percentage is not numeric, the collected value is
    kept unchanged.

    @param documentDict: mapping that holds the parsed document DOM trees.
    @param styleAttributeList: dictionary of attribute names and values; it is
    modified in place.

    @return: The same dictionary, updated. It is returned unchanged when the
    style is not found.
    '''
    styleElement = _getStyleElementByStyleName(documentDict, styleAttributeList['style:name'])
    if styleElement is None:
        return styleAttributeList

    paragraphProperties = styleElement.getElementsByTagName('style:paragraph-properties')
    textProperties = styleElement.getElementsByTagName('style:text-properties')

    # Later sources override earlier ones.
    sources = [styleElement]
    if paragraphProperties:
        sources.append(paragraphProperties[0])
    if textProperties:
        sources.append(textProperties[0])

    for source in sources:
        isTextProperties = bool(textProperties) and source is textProperties[0]
        for attribute in list(styleAttributeList):
            if not source.hasAttribute(attribute):
                continue
            value = source.getAttribute(attribute)
            if isTextProperties and attribute == 'fo:font-size' and value.endswith('%'):
                inherited = styleAttributeList[attribute]
                try:
                    # float() accepts sizes such as "10.5pt" and avoids
                    # truncating the scaled result.
                    baseSize = float(inherited.split('pt')[0])
                    percentage = float(value.split('%')[0])
                except (AttributeError, ValueError):
                    continue
                styleAttributeList[attribute] = str(baseSize * percentage / 100)
            else:
                styleAttributeList[attribute] = value

    return styleAttributeList
948
949
951 '''Gets the parent style list for the given style name.
952
953 @return: The list of parent styles (lists first entry is style itself).
954 '''
955 parentStyleList = [styleName]
956
957 while _checkParentStyle(documentDict, styleName):
958 styleName = _checkParentStyle(documentDict, styleName)
959 parentStyleList.append(styleName)
960
961 return parentStyleList
962
963
# NOTE(review): the def line is absent from this listing; name and argument order
# are taken from the existing call _checkParentStyle(documentDict, styleName).
def _checkParentStyle(documentDict, styleName):
    '''Checks if the style have a parent style.

    @return: The value of the style:parent-style-name attribute of the style,
    or None if no style element is found for the given style name.
    '''
    styleElement = _getStyleElementByStyleName(documentDict, styleName)
    # An unknown style is the only failure this lookup is expected to hit, so
    # it is tested explicitly instead of hiding every exception.
    if styleElement is None:
        return None
    return styleElement.getAttribute('style:parent-style-name')
973
988
989
991 '''Checks the caption and the reference of the image.
992
993 @return: True if the document images have caption and reference, otherwise False.
994 '''
995 caption = False
996 reference = False
997 imageElements = documentDict['content.xml'].getElementsByTagName ('draw:image')
998 if imageElements:
999 for element in imageElements:
1000 captionNode = element.parentNode.parentNode
1001 if len(common_methods.getTextContent(captionNode)) > 0:
1002 caption = True
1003 imagesReferenceElements = captionNode.getElementsByTagName('text:sequence')
1004 if imagesReferenceElements:
1005 reference = True
1006
1007 if caption is False and reference is False:
1008 return False
1009 elif caption is False:
1010 return False
1011 elif reference is False:
1012 return False
1013 return True
1014
1015
def checkCoverPage(documentDict):
    '''Checks that the front page is done correctly.
    Every element of the cover section is compared with the document meta data.

    @return: The cover definitions in a dictionary, or False if
    _getSectionElements gives None for the cover section.

    @keyword title: True if the title in cover page is the same as in the document meta.
    @keyword name: True if the cover page contains the same author name as in the document meta.
    @keyword email: True if the cover page contains e-mail address.

    '''
    coverElements = _getSectionElements(documentDict, 'cover')
    if coverElements is None:
        return False

    meta = ooo_meta_inspector.getMeta(documentDict)
    cover = dict.fromkeys(('title', 'name', 'email'), False)
    # Result key --> meta key whose value has to appear on the cover page.
    metaChecks = (('name', 'dc:creator'), ('title', 'dc:title'))

    for coverElement in coverElements:
        if common_methods.checkIfEmailAddress (coverElement):
            cover['email'] = True
        for resultKey, metaKey in metaChecks:
            if common_methods.checkStringFromContent(coverElement, meta[metaKey]):
                cover[resultKey] = True
    return cover
1043
1044
1045
1046
1048 '''Gets the page number format and the author name from the document.
1049
1050 @param section: can have a value 'cover', 'toc' or 'text'.
1051
1052 @return: The dictionary which contains the author and the page number information.
1053 '''
1054 sectionList = _getSectionBreakElements(documentDict)
1055 sectionBrakeElements = {'cover':sectionList[0], 'toc':sectionList[1], 'text':sectionList[2]}
1056
1057 styleName = sectionBrakeElements[section].getAttribute('text:style-name')
1058
1059 styleElement = _getStyleElementByStyleName(documentDict, styleName)
1060
1061 masterPageStyleName = _getMasterPageStyleName(documentDict, styleElement)
1062 masterPageStyleElement = _getMasterPageStyleElement (documentDict, masterPageStyleName)
1063 authorAndNumberDict = getAuthorAndPageNumberFormat(documentDict, masterPageStyleElement)
1064
1065
1066 authorAndNumberDict['numStart'] = styleElement.getElementsByTagName('style:paragraph-properties')[0].getAttribute('style:page-number')
1067 return authorAndNumberDict
1068
1069
1131
1132
# NOTE(review): the def line is absent from this listing; name and argument order
# are taken from the existing calls _getSectionElements(documentDict, 'cover').
def _getSectionElements(documentDict, section):
    '''Gets the elements of the wanted section.
    The section break elements change the section.
    The children of the first child of office:body are split into three lists:
    the elements before the second section break belong to the cover-section,
    the elements from the second section break up to the third one belong to the toc-section
    and the elements from the third section break to the end belong to the text-section.
    Document have to have atleast 3 sections.

    @note: If a section break element is not found among those children, the
    remaining elements stay in the section that was being collected and the
    following sections are empty.

    @param section: 'cover', 'toc' or 'text'.

    @return: The section elements in the list, or None if the document has
    less than 3 section break elements.
    '''
    sectionBreakElementList = _getSectionBreakElements(documentDict)
    if len(sectionBreakElementList) < 3:
        return None

    officeBodyElement = documentDict['content.xml'].getElementsByTagName ('office:body')[0]
    documentElementList = officeBodyElement.firstChild.childNodes
    elementCount = len(documentElementList)

    # Both scans are bounded by the element count: a section break that is
    # nested deeper in the document (and therefore never met here) must not
    # push the index past the end of the list.
    tocStart = 0
    while tocStart < elementCount and not documentElementList[tocStart].isSameNode(sectionBreakElementList[1]):
        tocStart += 1

    textStart = tocStart
    while textStart < elementCount and not documentElementList[textStart].isSameNode(sectionBreakElementList[2]):
        textStart += 1

    sectionElements = {
        'cover': list(documentElementList[:tocStart]),
        'toc': list(documentElementList[tocStart:textStart]),
        'text': list(documentElementList[textStart:]),
    }
    return sectionElements[section]
1171
1172
1174 '''Checks that the document sections have been made correctly.
1175 If the amount of the section breaks is not over 3 then return the error message list.
1176
1177 @return: True if the sections are ok, return errorList if not ok.
1178 '''
1179 toc = False
1180 cover = True
1181 sections = len(_getSectionBreakElements (documentDict))
1182 if sections < 3:
1183 return False
1184 coverElements = _getSectionElements(documentDict, 'cover')
1185 for coverElement in coverElements:
1186 if coverElement.nodeName == 'text:table-of-content':
1187 cover = False
1188 errorList.append('cover')
1189
1190 tocElements = _getSectionElements(documentDict, 'toc')
1191 for tocElement in tocElements:
1192 if tocElement.nodeName == 'text:table-of-content':
1193 toc = True
1194 break
1195 if toc is False:
1196 errorList.append('toc')
1197
1198 if toc and cover is True:
1199 return True
1200 else:
1201 return errorList
1202
1203
1212
1221
1229
1237
1238
1239
'''Goes through all the elements in the document which have used any style.
Checks that elements are using the correct styles (i.e. not Standard style) and that no manual style definitions are made (like T1).

A style name starting with 'T' and a digit is always treated as a manual change.
A style name starting with 'P' and a digit is treated as a manual change if a
child element of the style defines fo:font-style, fo:text-align or fo:font-weight.
The first 30 characters of the text of each offending element are added to
errorIdsAndPositions under the key 'manualChanges' or 'styleNotUsed'.

'''

# Collected once and kept as a list, because the elements are walked twice.
usedElements = list(_getListOfUsedStyleElements(documentDict))
manualAttributes = ('fo:font-style', 'fo:text-align', 'fo:font-weight')

illegalStyles = []
for usedElement in usedElements:
    styleName = usedElement.getAttribute('text:style-name')
    # Slices instead of indexes: an empty or one character long style name
    # must not raise an IndexError.
    prefix = styleName[:1]
    if not styleName[1:2].isdigit():
        continue

    if prefix == 'T':
        illegalStyles.append(styleName)

    elif prefix == 'P':
        styleElement = _getStyleElementByStyleName(documentDict, styleName)
        if styleElement is None:
            # The style definition was not found, nothing to inspect.
            continue
        for child in styleElement.childNodes:
            # Only element nodes have attributes; text and comment nodes are skipped.
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if any(child.hasAttribute(attributeName) for attributeName in manualAttributes):
                illegalStyles.append(styleName)

# The lookup set is built once, not for every element.
illegalStyleNames = set(illegalStyles)
for usedElement in usedElements:
    styleName = usedElement.getAttribute('text:style-name')
    if styleName in illegalStyleNames:
        errorIdsAndPositions['manualChanges'].append(common_methods.getTextContent(usedElement)[:30])
    elif styleName == 'Standard':
        errorIdsAndPositions['styleNotUsed'].append(common_methods.getTextContent(usedElement)[:30])
1263