1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 '''
32 The module provides the methods for inspecting docx files.
33
34 @author: Vili Auvinen, Juho Tammela
35 '''
36
37 from common_methods import *
38 from math import fabs
39 from conversions import convertTwipToCm, convertTwipToPt
40
def _getStyleElementById(styleId, styleXml):
    '''Gets a style element by the style id from the styles.xml.

    The style id links a paragraph using a style in document.xml to the right style in styles.xml.
    The style id can be in different languages depending on the language of the Word that wrote the document.

    @note: XML example:

    <w:p>
        <w:pPr>
            <w:pStyle w:val="Otsikko1"/> (This is the style id.)
        </w:pPr>
        <r>
            ...
        </r>
    </w:p>

    @param styleId: The style id.
    @param styleXml: styles.xml as a DOM tree.

    @return: The style element with a given style id or None if no matching style element was found.
    '''
    for styleElement in styleXml.getElementsByTagName('w:style'):
        if styleElement.getAttribute('w:styleId') == styleId:
            return styleElement
    return None
73
def _getStyleElementByName(styleName, styleXml):
    '''Gets a style element by a style name from styles.xml.

    The style name is found in the styles.xml.
    A style always has the same name regardless of the language of the Word that wrote the document.
    The comparison is case-sensitive.

    @note: XML example:

    <w:style w:type="paragraph" w:styleId="Otsikko1"> (This is the style id.)
        <w:name w:val="heading 1"/> (Here is the style name.)
        ...
    </w:style>

    @param styleName: The style name.
    @param styleXml: styles.xml as a DOM tree.

    @return: The style element with a given style name, or None if no matching style element was found.
    '''
    for nameElement in styleXml.getElementsByTagName('w:name'):
        if nameElement.getAttribute('w:val') == styleName:
            # <w:name> is a child of the <w:style> element that is wanted.
            return nameElement.parentNode
    return None
98
def _getBasedOnStyleId(styleName, styleXml):
    '''Get the based-on style id for a given style from styles.xml.

    @param styleName: The style name of the style whose based-on style id is wanted.
    @param styleXml: styles.xml as a DOM tree.

    @return: The id of the based-on style for a given style name, or None if the style
    was not found or it has no based-on style.
    '''
    # Look the style up once instead of twice.
    styleElement = _getStyleElementByName(styleName, styleXml)
    if styleElement is None:
        return None

    # An explicit emptiness check replaces the former bare except around the [0] lookup.
    basedOnElements = styleElement.getElementsByTagName('w:basedOn')
    if len(basedOnElements) == 0:
        return None
    return basedOnElements[0].getAttribute('w:val')
113
def _getStyleName(styleElement):
    '''Get the style name of a style element.

    @param styleElement: The style element whose style name is wanted.

    @return: The w:val attribute of the first <w:name> element inside the given style element.
    @raise IndexError: If the style element contains no <w:name> element.
    '''
    nameElement = styleElement.getElementsByTagName('w:name')[0]
    return nameElement.getAttribute('w:val')
122
def _getStyleNameById(styleId, styleXml):
    '''Get the name of a style with a given style id.

    @param styleId: The style id to be looked for.
    @param styleXml: styles.xml as a DOM tree.

    @return: The style name of the style with the correct style id, or None if it wasn't found.
    '''
    styleElement = _getStyleElementById(styleId, styleXml)
    if styleElement is None:
        return None
    return _getStyleName(styleElement)
135
def _getStyleIdByName(styleName, styleXml):
    '''Get the id of a style with a given style name from styles.xml.

    The style name is compared case-insensitively.

    @param styleName: The style name to be looked for.
    @param styleXml: styles.xml as a DOM tree.

    @return: The style id of the style with the correct style name, or None if it wasn't found.
    '''
    for nameElement in styleXml.getElementsByTagName('w:name'):
        if nameElement.getAttribute('w:val').lower() == styleName.lower():
            # The id is an attribute of the enclosing <w:style> element.
            return nameElement.parentNode.getAttribute('w:styleId')
    return None
151
# NOTE(review): the `def` header of this function is missing from this extract; the
# body reads a single name `p`, presumably the only parameter -- confirm against the original file.
    '''Gets the style id of a paragraph element.

    @param p: The paragraph element.

    @return: The w:val of the first <w:pStyle> element inside p, or None if the lookup failed.
    '''
    try:
        # The first <w:pStyle> descendant carries the style id in its w:val attribute.
        styleId = p.getElementsByTagName('w:pStyle')[0].getAttribute('w:val')
    except:
        # NOTE(review): bare except; the expected failure is an IndexError when the
        # paragraph has no <w:pStyle>, but any other error is silenced as well.
        return None
    return styleId
164
165
def _getThemeFont(themeXml, styleDefinitions, themeFont):
    '''Gets a theme font from theme1.xml and stores it as the font of the style definitions.

    @note: XML example:

    <a:fontScheme name="Office">
        <a:majorFont>
            <a:latin typeface="Cambria"/>
            <a:ea typeface=""/>
            <a:cs typeface=""/>
        </a:majorFont>
        <a:minorFont>
            <a:latin typeface="Calibri"/>
            <a:ea typeface=""/>
            <a:cs typeface=""/>
        </a:minorFont>
    </a:fontScheme>

    @param themeXml: theme1.xml as DOM tree.
    @param styleDefinitions: The style definitions dict.
    @see: _getCompleteStyleDefinitions.
    @param themeFont: The theme font. Should start with either 'major' or 'minor'.

    @return: The style definitions with or without changes. When a non-blank latin typeface
    is found, 'w:ascii' is set to it and 'w:asciiTheme' is reset to None.
    '''
    if themeFont == "" or themeFont is None:
        return styleDefinitions

    if themeFont.startswith('major'):
        fontTagName = 'a:majorFont'
    elif themeFont.startswith('minor'):
        fontTagName = 'a:minorFont'
    else:
        return styleDefinitions

    # An incomplete theme leaves the definitions untouched instead of raising IndexError.
    fontElements = themeXml.getElementsByTagName(fontTagName)
    if len(fontElements) == 0:
        return styleDefinitions
    latinElements = fontElements[0].getElementsByTagName('a:latin')
    if len(latinElements) == 0:
        return styleDefinitions

    typeface = latinElements[0].getAttribute('typeface')
    if typeface.strip() != "":
        styleDefinitions['w:ascii'] = typeface
        styleDefinitions['w:asciiTheme'] = None

    return styleDefinitions
209
def _getStyleDefinitions(element, styleDefinitions):
    '''Return the style definitions of a given element.

    First recurses into the child elements of the element.
    Next checks if the element has attributes.
    - If the attribute name is a key in the dict, stores the value of the attribute.
    - If the attribute name is 'w:val' and the element tag name is a key in the dict,
      stores the value of the attribute.
    If the element tag name is a key in the dict and the element has neither attributes
    nor children, stores True in the dict.
    <w:pBdr> elements are skipped together with everything inside them.

    @param element: Style definitions are searched inside this element.
    @param styleDefinitions: A dict where the style definitions are stored.
    May contain tag names or attribute names and some default values. It is modified in place.

    @return: The style definitions in a dict.
    '''
    if element.tagName == "w:pBdr":
        return styleDefinitions

    for child in element.childNodes:
        # Only elements have a tagName; text and comment nodes are skipped.
        if child.nodeType == child.ELEMENT_NODE:
            styleDefinitions = _getStyleDefinitions(child, styleDefinitions)

    if element.hasAttributes():
        for i in range(0, element.attributes.length):
            attribute = element.attributes.item(i)
            # `in` replaces dict.has_key, which no longer exists in Python 3.
            if attribute.name in styleDefinitions:
                styleDefinitions[attribute.name] = attribute.value
            elif element.tagName in styleDefinitions and attribute.name == "w:val":
                styleDefinitions[element.tagName] = attribute.value
    elif element.tagName in styleDefinitions and element.hasChildNodes() is False:
        # A bare toggle element such as <w:b/> switches the property on.
        styleDefinitions[element.tagName] = True

    return styleDefinitions
258
259
def getStyle(document, requirementStyleName):
    '''Gets all definitions of a style from the document dictionary.

    Line spacing is expressed as a multiple of 240, the font size is halved,
    spacing before/after is converted from twips to points and the left indent
    from twips to centimeters.

    @param document: The document as a dict of DOM trees keyed by file path.
    @param requirementStyleName: The name of the wanted style.

    @return: A dict with all the style definitions of the one style, with the keys
    translated to match the return value of odt_inspector's getStyle(). False, if the style was not found.
    '''
    styleXml = document['word/styles.xml']
    themeXml = document['word/theme/theme1.xml']

    if _isStyleUsed(document, requirementStyleName) is False:
        return False

    # Resolve name -> id -> name so that the name is spelled as in styles.xml.
    resolvedName = _getStyleNameById(_getStyleIdByName(requirementStyleName, styleXml), styleXml)
    definitions = _getCompleteStyleDefinitions(styleXml, resolvedName, themeXml)
    if definitions is None:
        return False

    definitions['w:name'] = requirementStyleName
    definitions['w:line'] = float(definitions['w:line']) / float(240.0)
    definitions['w:sz'] = round(float(definitions['w:sz']) / 2, 1)
    definitions['w:before'] = convertTwipToPt(float(definitions['w:before']))
    definitions['w:after'] = convertTwipToPt(float(definitions['w:after']))
    definitions['w:left'] = round(convertTwipToCm(float(definitions['w:left'])), 1)

    # docx key -> key shared with the odt inspector.
    keyPairs = (('w:name', 'styleName'),
                ('w:ascii', 'fontName'),
                ('w:sz', 'fontSize'),
                ('w:caps', 'transform'),
                ('w:left', 'indentLeft'),
                ('w:right', 'indentRight'),
                ('w:firstLine', 'indentFirstLine'),
                ('w:line', 'linespacing'),
                ('w:before', 'spacingBefore'),
                ('w:after', 'spacingAfter'),
                ('w:keepNext', 'keepWithNext'),
                ('w:jc', 'alignment'),
                ('w:widowControl', 'widowControl'),
                ('w:b', 'bold'),
                ('w:i', 'italic'))

    translated = {}
    for docxKey, commonKey in keyPairs:
        translated[commonKey] = definitions[docxKey]
    return translated
328
def _getCompleteStyleDefinitions(styleXml, styleName, themeXml):
    '''Returns the style definitions of the given style from styles.xml and theme1.xml.

    Recursion is used because the style can be based on some other style.
    In addition, the base style gets style definitions from the document defaults.
    Finally, some style definitions are not found in the XML file at all. These definitions
    use some default value which must be assumed.

    @note: XML example:

    <w:style w:type="paragraph" w:default="1" w:styleId="Normaali">
        <w:name w:val="Normal"/>
        <w:qFormat/>
        <w:rsid w:val="006B493C"/>
        <w:pPr>
            <w:spacing w:before="140" w:after="220" w:line="360" w:lineRule="auto"/>
            <w:ind w:left="567"/>
            <w:jc w:val="both"/>
        </w:pPr>
        <w:rPr>
            <w:rFonts w:ascii="Georgia" w:hAnsi="Georgia"/>
            <w:lang w:val="fi-FI"/>
        </w:rPr>
    </w:style>

    @param styleXml: styles.xml file as a DOM tree.
    @param styleName: The name of the style (NOT the id).
    @see: _getStyleElementById and _getStyleElementByName for difference.
    @param themeXml: theme1.xml as DOM tree.

    @return: Style definitions in a dict, or None if no style with the given name exists.
    '''
    styleElement = _getStyleElementByName(styleName, styleXml)
    if styleElement is None:
        return None

    styleDefinitions = None
    basedOnStyleId = _getBasedOnStyleId(styleName, styleXml)
    if basedOnStyleId is not None:
        # Inherit everything from the parent style first.
        styleDefinitions = _getCompleteStyleDefinitions(
            styleXml, _getStyleNameById(basedOnStyleId, styleXml), themeXml)

    if styleDefinitions is None:
        # Either this is a root style or the based-on style could not be resolved;
        # in both cases start from the assumed defaults plus the document defaults
        # instead of passing None on (which used to crash).
        styleDefinitions = {'w:name': None,
                            'w:basedOn': None,
                            'w:next': None,
                            'w:link': None,
                            'w:autoRedefine': None,
                            'w:left': '0',
                            'w:right': '0',
                            'w:firstLine': None,
                            'w:line': '240.0',
                            'w:before': '0',
                            'w:after': '0',
                            'w:widowControl': True,
                            'w:jc': 'left',
                            'w:sz': '24',
                            'w:ascii': None,
                            'w:asciiTheme': None,
                            'w:b': False,
                            'w:i': False,
                            'w:u': False,
                            'w:lang': None,
                            'w:keepNext': None,
                            'w:keepLines': None,
                            'w:pageBreakBefore': None,
                            'w:caps': None}
        docDefaults = styleXml.getElementsByTagName('w:docDefaults')
        if len(docDefaults) > 0:
            styleDefinitions = _getStyleDefinitions(docDefaults[0], styleDefinitions)

    # The style's own definitions override whatever was inherited.
    styleDefinitions = _getStyleDefinitions(styleElement, styleDefinitions)

    if 'w:asciiTheme' in styleDefinitions:
        styleDefinitions = _getThemeFont(themeXml, styleDefinitions, styleDefinitions['w:asciiTheme'])

    return styleDefinitions
481
def _getElementValueWithinElement(elementTagName, element):
    '''Gets the text content of the first element with a certain tag in the given DOM tree.

    @param elementTagName: The tag name of the wanted element.
    @param element: The element (or document) whose descendants are searched.

    @return: The text value of the element, or None if there is no such element or it is empty.
    '''
    try:
        return element.getElementsByTagName(elementTagName)[0].firstChild.nodeValue
    except (IndexError, AttributeError):
        # IndexError: no element with that tag; AttributeError: the element has no child node.
        return None
492
def _getElementWithinElement(element, elementTagName):
    '''Gets the first descendant of an element with the given tag name.

    @param element: The element whose descendants are searched.
    @param elementTagName: The tag name of the wanted element.

    @return: An element with the right tag name, or None if it wasn't found.
    '''
    matches = element.getElementsByTagName(elementTagName)
    if len(matches) == 0:
        return None
    return matches[0]
508
def _getElementsWithinElement(element, elementTagName):
    '''Gets the descendants of an element with the given tag name.

    @param element: The element whose descendants are searched.
    @param elementTagName: Tag name of the wanted elements.

    @return: The list of elements with the right tag name, or None if none was found.
    '''
    matches = element.getElementsByTagName(elementTagName)
    if len(matches) > 0:
        return matches
    # Callers rely on None (not an empty list) to signal "nothing found".
    return None
523
524
525
526
527
528
529
530
531
532
533
534
def _getTargetXmlFileByHeader(header, document):
    '''Gets a header reference target xml-file as a DOM tree.

    @param header: The reference element; its r:id attribute identifies the relationship.
    @param document: The document as a dict of DOM trees keyed by file path.

    @return: The DOM tree stored in document under 'word/' + the relationship target.
    '''
    relsXml = document['word/_rels/document.xml.rels']
    targetFile = getRelsTargetByRId(header.getAttribute('r:id'), relsXml)
    return document['word/' + targetFile]
542
# NOTE(review): the `def` header of this function is missing from this extract; the
# body reads `references` and `document`, presumably its parameters -- confirm.
    '''Goes through header or footer references and checks if there is any content in them.

    Content is detected by looking for <w:t> tags in the referenced file.
    Even if there are references to headers or footers, they might be empty.

    @param references: Header or footer references.

    @return: The first header or footer target XML file (as a DOM tree) that contains a
    <w:t> element, or None if there were no references or none of them had content.
    '''

    if references is not None:
        for header in references:
            targetFileXml = _getTargetXmlFileByHeader(header, document)

            # _getElementsWithinElement returns None when no <w:t> is found.
            if _getElementsWithinElement(targetFileXml, 'w:t') is not None:
                return targetFileXml
560
561
def _checkAutomaticPageNumbering(section, headerReference, footerReference, document, errorIds, numStartKey):
    '''Checks if a section has an automatic page numbering and gets the numbering format.

    First goes through the section element and checks that the numbering starts at 1.
    Gets the numbering format definition of the section.
    If the numbering format is 'Standard', checks the header and footer references for other
    numbering format definitions. The numbering format in the header or the footer reference
    is sometimes in a <w:instrText> element with content like PAGE \\* roman \\* MERGEFORMAT.

    @param section: The section element to be searched for.
    @param headerReference: The current header of the section element as a DOM tree.
    @param footerReference: The current footer of the section element as a DOM tree.
    @param document: The document as a dict of DOM tree pairs (not used here).
    @param errorIds: The dict for appending errors True/False.
    @param numStartKey: The key for errorIds to append numbering start error.

    @return: The page numbering format as a string, or False if there was no page numbering
    or the numbering was both in header and footer.
    '''
    pgNumTypeElement = _getElementWithinElement(section, 'w:pgNumType')
    if pgNumTypeElement is None:
        return False

    # True when the numbering starts at 1, False otherwise.
    errorIds[numStartKey] = str(pgNumTypeElement.getAttribute('w:start')) == '1'

    numFormat = _getPgNumFormat(pgNumTypeElement)

    if numFormat == "Standard":
        headerFormat = _getPageFieldNumberFormat(headerReference)
        # The footer format is read from the footer itself (it used to be read from the header).
        footerFormat = _getPageFieldNumberFormat(footerReference)

        if headerFormat is not None and footerFormat is not None:
            return False
        elif headerFormat is not None and headerFormat != "Standard":
            return headerFormat
        elif footerFormat is not None and footerFormat != "Standard":
            return footerFormat

    return numFormat


def _getPageFieldNumberFormat(reference):
    '''Extracts the number format from the PAGE field of a header or footer DOM tree.

    @param reference: The header or footer as a DOM tree, or None.

    @return: The lower-cased format switch between the first two '\\*' separators,
    or None if there is no such field.
    '''
    if reference is None:
        return None

    instrText = _getElementValueWithinElement('w:instrText', reference)
    if instrText is None:
        return None

    # Membership tests instead of str.find(): find() returns -1 (truthy) on a miss
    # and 0 (falsy) on a match at the very start.
    if 'PAGE' not in instrText or 'MERGEFORMAT' not in instrText:
        return None

    parts = instrText.split('\\*')
    if len(parts) > 2:
        return parts[1].lower().strip()
    return None
629
671
672
686
832
def getParagraphElementsBySections(docXml, sectionName):
    '''Get the paragraph elements of the wanted section.

    Page breaking section break elements change the section, continuous section break elements don't.
    A section break without an explicit w:type counts as page breaking.

    The first list of the section elements is the cover section.
    The second list of the section elements is the table of contents section.
    The third list of the section elements is the text section.

    @param docXml: The document.xml file as a DOM tree.
    @param sectionName: The wanted section can be 'cover', 'toc' or 'text'.

    @return: The list of the body child nodes in the wanted section (empty if the document
    has too few sections), or None if sectionName is not one of the three names.
    '''
    sectionIndexes = {'cover': 0, 'toc': 1, 'text': 2}

    sectionList = [[]]
    bodyElement = docXml.getElementsByTagName('w:body')[0]

    for node in bodyElement.childNodes:
        sectionList[-1].append(node)
        # Text and comment nodes cannot hold a section break.
        if node.nodeType != node.ELEMENT_NODE:
            continue
        sectPrs = node.getElementsByTagName('w:sectPr')
        if len(sectPrs) == 0:
            continue
        typeElements = sectPrs[0].getElementsByTagName('w:type')
        if len(typeElements) > 0 and typeElements[0].getAttribute('w:val') == 'continuous':
            continue
        # A page breaking section break closes the current section.
        sectionList.append([])

    if sectionName not in sectionIndexes:
        return None
    wantedIndex = sectionIndexes[sectionName]
    if wantedIndex >= len(sectionList):
        # Fewer than three sections: report the section as empty instead of raising IndexError.
        return []
    return sectionList[wantedIndex]
869
def getSectionElementsBySections(docXml, index=None):
    '''Gets all the w:sectPr elements of a document or optionally the w:sectPr elements of a specific section.

    w:sectPr elements are stored in a two dimensional list.
    Continuous section breaks are appended to the current outer list index.
    A page breaking section (any w:sectPr that is not explicitly continuous) is appended
    as well and then raises the outer list index.

    @param docXml: The document.xml file as a DOM tree.
    @param index: The index of the outer pageSections list that is wanted. None by default.

    @return: The two dimensional list of all w:sectPr elements if index is None.
    Otherwise returns the list at the given index.
    @raise IndexError: If index is given and there is no such page section.
    '''
    pageSections = [[]]

    # getElementsByTagName yields an empty list (never None) when nothing matches.
    for section in docXml.getElementsByTagName("w:sectPr"):
        pageSections[-1].append(section)
        typeElements = section.getElementsByTagName('w:type')
        isContinuous = len(typeElements) > 0 and typeElements[0].getAttribute('w:val') == 'continuous'
        if not isContinuous:
            pageSections.append([])

    # Drop the empty list opened after the last page breaking section.
    if len(pageSections[-1]) == 0:
        pageSections.pop()

    if index is None:
        return pageSections
    return pageSections[index]
904
def _areSectionsOverlapping(outerParagraphElements, innerParagraphElements, errorList, errorMsg, expectedResult):
    '''Goes through two lists of paragraph elements checking if the same paragraph is in both lists.

    The search stops at the first shared node, so the error message is appended at most once.

    @param outerParagraphElements: The outer paragraph list to be searched for.
    @param innerParagraphElements: The inner paragraph list to be searched for.
    @param errorList: The list for appending error messages.
    @param errorMsg: The error message to be appended.
    @param expectedResult: Boolean of the expected result.

    @return: The negation of expectedResult if a shared node was found, otherwise expectedResult unchanged.
    '''
    for outerElement in outerParagraphElements:
        for innerElement in innerParagraphElements:
            if outerElement.isSameNode(innerElement):
                errorList.append(errorMsg)
                return not expectedResult
    return expectedResult
928
# NOTE(review): the `def` header of this function is missing from this extract; the
# body reads `document` and `errorList`, presumably its parameters -- confirm.
    '''Goes through the section elements in the document checking that the sections are done properly.

    There must be at least three page sections in the document.
    Paragraphs using the table of contents styles must not be in the cover section and
    must be in the table of contents section.
    Also checks that no section has <w:titlePg> (the Word setting "Different first page").

    @return: True if everything went well, False if there are fewer than three sections or
    a <w:titlePg> element was found, otherwise the error list.
    '''
    docXml = document['word/document.xml']
    styleXml = document['word/styles.xml']
    # Expected results: the cover must NOT overlap the toc paragraphs (stays True),
    # the toc section MUST overlap them (False flips to True).
    cover = True
    toc = False

    allSectionProperties = getSectionElementsBySections(docXml)

    if len(allSectionProperties) < 3:
        return False

    for section in allSectionProperties:
        for sectPr in section:
            if len(sectPr.getElementsByTagName('w:titlePg')) > 0:
                errorList.append('titlePg')
                return False

    tocParagraphs = _getParagraphElementsBySequentialStyleName('toc', styleXml, docXml)
    coverSectionParagraphs = getParagraphElementsBySections(docXml, 'cover')
    tocSectionParagraphs = getParagraphElementsBySections(docXml, 'toc')

    # An overlap appends "cover" to errorList and turns cover to False.
    cover = _areSectionsOverlapping(coverSectionParagraphs, tocParagraphs, errorList, "cover", cover)

    # NOTE(review): an overlap here is the wanted outcome, yet "toc" is still
    # appended to errorList by _areSectionsOverlapping -- confirm this is intended.
    toc = _areSectionsOverlapping(tocSectionParagraphs, tocParagraphs, errorList, "toc", toc)

    if toc is True and cover is True:
        return True
    else:
        return errorList
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
def _checkPageProperties(allSectionProperties, pageProperties, tagName):
    '''Goes through all section properties to see that they have coherent property values.

    If the property value is the same in all section elements, the value is stored in pageProperties.
    If something is different between the sections, it's wrong and the page property is set False.
    For example, if two different section elements have a different page top margin, the property is set False.
    A section without the wanted element is treated like one whose attributes are all missing ('').

    @param allSectionProperties: All w:sectPr elements of the document (None is treated as empty).
    @param pageProperties: The allowed page properties are {'top': None, 'right': None, 'bottom': None, 'left': None}
    or {'w': None, 'h': None}. The dict is modified in place.
    @param tagName: Tag name of the element whose properties are checked.

    @return: pageProperties dict with coherent page values and incoherent values set as False.
    '''
    for sectionElement in allSectionProperties or []:
        # One lookup per section instead of one per property.
        propertyElements = sectionElement.getElementsByTagName(tagName)
        for key in pageProperties.keys():
            if len(propertyElements) > 0:
                size = propertyElements[0].getAttribute('w:' + key)
            else:
                size = ''
            if pageProperties[key] is None:
                pageProperties[key] = size
            elif pageProperties[key] != size:
                pageProperties[key] = False

    return pageProperties
1014
def _convertSizes(sizesDict):
    '''Converts every value of a dict from twips to centimeters, rounded to one decimal.

    @param sizesDict: A dict whose values can be converted with int(); modified in place.

    @return: The same dict with the converted values.
    '''
    for key in sizesDict.keys():
        centimeters = convertTwipToCm(int(sizesDict[key]))
        sizesDict[key] = round(centimeters, 1)

    return sizesDict
1023
def getPageMarginals(document):
    '''Gets the sizes of the document page margins.

    @param document: The document as a dict of DOM trees keyed by file path.

    @return: False if the margins are not coherent between the sections, otherwise a
    dictionary containing the margin sizes converted by _convertSizes.
    '''
    sectionProperties = _getElementsWithinElement(document['word/document.xml'], 'w:sectPr')

    margins = {'top': None, 'right': None, 'bottom': None, 'left': None}
    # Fills margins in place; an incoherent side ends up as False.
    _checkPageProperties(sectionProperties, margins, 'w:pgMar')

    for side in margins.keys():
        if margins[side] == False:
            return False

    return _convertSizes(margins)
1043
def getPageSize(document):
    '''Gets the document page size.

    @param document: The document as a dict of DOM trees keyed by file path.

    @return: False if the page sizes are not coherent between the sections, otherwise a
    dictionary with the keys 'width' and 'height' converted by _convertSizes.
    '''
    sectionProperties = _getElementsWithinElement(document['word/document.xml'], 'w:sectPr')

    rawSize = {'w': None, 'h': None}
    # Fills rawSize in place; an incoherent dimension ends up as False.
    _checkPageProperties(sectionProperties, rawSize, 'w:pgSz')

    namedSize = {'width': rawSize['w'], 'height': rawSize['h']}

    for dimension in namedSize.keys():
        if namedSize[dimension] == False:
            return False

    return _convertSizes(namedSize)
1062
1063
1067
# NOTE(review): the `def` header of this function is missing from this extract; the
# body reads `coreXml`, presumably its only parameter -- confirm.
    '''Gets the document creator, i.e. the text of the first <dc:creator> element in coreXml.

    @return: The creator, or None if not found.
    '''

    return _getElementValueWithinElement('dc:creator', coreXml)
1072
def _getLastModifier(coreXml):
    '''Gets the document last modifier, i.e. the text of the first <cp:lastModifiedBy> element.

    @param coreXml: The DOM tree that is searched (docProps/core.xml in checkCoverPage).

    @return: The last modifier, or None if not found.
    '''
    return _getElementValueWithinElement('cp:lastModifiedBy', coreXml)
1076
# NOTE(review): the `def` header of this function is missing from this extract; the
# body reads `coreXml`, presumably its only parameter -- confirm.
    '''Gets the document creation date, i.e. the text of the first <dcterms:created> element in coreXml.

    @return: The creation date as found in the XML, or None if not found.
    '''
    return _getElementValueWithinElement('dcterms:created', coreXml)
1080
# NOTE(review): the `def` header of this function is missing from this extract; the
# body reads `coreXml`, presumably its only parameter -- confirm.
    '''Gets the document last modified date, i.e. the text of the first <dcterms:modified> element in coreXml.

    @return: The last modified date as found in the XML, or None if not found.
    '''
    return _getElementValueWithinElement('dcterms:modified', coreXml)
1084
# NOTE(review): the `def` header of this function is missing from this extract; the
# body reads `coreXml`, presumably its only parameter -- confirm.
    '''Gets the revision of the document, i.e. the text of the first <cp:revision> element in coreXml.

    @return: The revision as found in the XML, or None if not found.
    '''
    return _getElementValueWithinElement('cp:revision', coreXml)
1088
1089
def _getTextFromParagraph(paragraph):
    '''Gets the text content of the <w:t> elements from the given (paragraph) element.

    @param paragraph: The element whose <w:t> descendants are read.

    @return: The concatenated text content as a string ('' if there is no text).
    '''
    textParts = []
    for textElement in paragraph.getElementsByTagName('w:t'):
        # An empty <w:t/> has no child node and contributes no text.
        if textElement.firstChild is not None:
            textParts.append(textElement.firstChild.nodeValue)
    return ''.join(textParts)
1103
def checkTocContent(document):
    '''Checks if all of the headings created in the document are listed in the table of contents.

    The non-empty heading texts and the table of contents entries are compared pairwise in
    document order; every heading text must be contained in the entry at the same position.

    @param document: The document as a dict of DOM trees keyed by file path.

    @return: True if toc matches the headings content, False otherwise.
    '''
    docXml = document['word/document.xml']
    styleXml = document['word/styles.xml']

    headingParagraphs = _getParagraphElementsBySequentialStyleName("heading", styleXml, docXml)
    tocParagraphs = _getParagraphElementsBySequentialStyleName('toc', styleXml, docXml)

    docHeadings = []
    for heading in headingParagraphs:
        headingText = _getTextFromParagraph(heading).strip()
        if headingText != "":
            docHeadings.append(headingText)

    tocHeadings = [_getTextFromParagraph(tocParagraph).strip() for tocParagraph in tocParagraphs]

    if len(docHeadings) != len(tocHeadings):
        return False

    for tocHeading, docHeading in zip(tocHeadings, docHeadings):
        # The toc entry also carries e.g. the page number, so containment is enough.
        if tocHeading.find(docHeading) == -1:
            return False

    # _getElementsWithinElement returns None when nothing is found; treat that as empty.
    bookmarkStarts = _getElementsWithinElement(docXml, 'w:bookmarkStart') or []
    for tocParagraph in tocParagraphs:
        for instrElement in _getElementsWithinElement(tocParagraph, 'w:instrText') or []:
            if instrElement.firstChild is None:
                continue
            instrElementValue = instrElement.firstChild.nodeValue
            if instrElementValue.find('PAGEREF') != -1:
                # NOTE(review): this fails the check when a bookmark name CONTAINS the
                # whole field instruction text; the comparison is kept as it was, but it
                # looks inverted for a "PAGEREF points to an existing bookmark" test -- confirm.
                for bookmark in bookmarkStarts:
                    if bookmark.getAttribute('w:name').find(instrElementValue) != -1:
                        return False

    return True
1161
# NOTE(review): the `def` header of this function is missing from this extract; the
# body reads `document`, presumably its only parameter -- confirm.
    '''Checks if the document has a table of contents.

    As visible here, the check is: some <w:pStyle> in document.xml refers to the style
    whose name is "toc 1".
    NOTE(review): the earlier docstring also promised a page break check and a call to
    checkTocContent; neither is present in the code of this extract.

    @note: XML example:

    <w:p w:rsidR="002274FC" w:rsidRDefault="00FA6E61">
        <w:pPr>
            <w:pStyle w:val="Sisluet1"/>

    @return: True if a paragraph uses the "toc 1" style, False otherwise.
    '''

    # The style id is language dependent, so it is resolved from the fixed style name.
    styleId = _getStyleIdByName("toc 1", document['word/styles.xml'])

    pStyles = _getElementsWithinElement(document['word/document.xml'], 'w:pStyle')

    # None means that no paragraph in the document uses any style.
    if pStyles is None:
        return False

    for style in pStyles:
        if (style.getAttribute('w:val') == styleId):
            return True

    return False
1221
1222
1223
1224
1225
def checkCoverPage(document):
    '''Checks if the front page is done correctly.

    The text of the paragraphs up to the one holding the last section properties of the
    first page section is searched for an '@' character, the last modifier of the document
    and the document title.

    @param document: The document as a dict of DOM trees keyed by file path.

    @return: coverPageText dictionary with the keys 'email', 'name' and 'title' containing True or False values.
    '''
    coverPageText = {'email': False,
                     'name': False,
                     'title': False}

    docXml = document['word/document.xml']
    coreXml = document['docProps/core.xml']

    # None (no paragraphs at all) is treated as an empty list.
    paragraphs = _getElementsWithinElement(docXml, 'w:p') or []
    firstSectionProperties = getSectionElementsBySections(docXml, 0)

    # <w:sectPr> sits inside <w:pPr> inside the paragraph that ends the first page.
    lastParagraphOfFirstPage = firstSectionProperties[-1].parentNode.parentNode

    firstPageText = ''
    for element in paragraphs:
        firstPageText += getTextContent(element)
        if element.isSameNode(lastParagraphOfFirstPage):
            break

    if firstPageText.find('@') != -1:
        coverPageText['email'] = True

    # str.find(None) raises TypeError, so a missing last modifier is checked first.
    lastModifier = _getLastModifier(coreXml)
    if lastModifier is not None and firstPageText.find(lastModifier) != -1:
        coverPageText['name'] = True

    title = _getTitle(coreXml)
    if title is not None and firstPageText.find(title) != -1:
        coverPageText['title'] = True

    return coverPageText
1267
def getRelsTargetByRId(rId, rels):
    '''Returns the value of the Target attribute of a Relationship element with the given id in a given rels file.

    The value of the Target attribute can be for example a relative path to local XML files or images.
    It can also be a hyperlink.

    @param rId: Id attribute value of a Relationship element.
    @param rels: rels file as a DOM tree.

    @return: The value of the Target attribute if found, otherwise None.
    '''
    for relationship in rels.getElementsByTagName('Relationship'):
        if relationship.getAttribute('Id') == rId:
            return relationship.getAttribute('Target')
    return None
1280
def getParentParagraph(element, tag='w:p'):
    '''Returns the closest ancestor <w:p> element of a given element if there is one.

    @param element: The element whose parent <w:p> element is searched for.
    @param tag: The parent tag name, defaults to 'w:p'.

    @return: The parent element, or None if no parent is found.
    '''
    parent = element.parentNode

    while parent is not None:
        # Nodes without a tagName (e.g. the document node) end the search.
        if not hasattr(parent, 'tagName'):
            return None
        if parent.tagName == tag:
            return parent
        parent = parent.parentNode
    return None
1301
# NOTE(review): the `def` header of this function is missing from this extract; the
# body reads `document`, presumably its only parameter -- confirm.
    '''Check if there is an image in the document.

    An image is any <pic:pic> or <w:pict> element in word/document.xml.

    @return: True if even one image is found, False otherwise.
    '''

    picElements = document['word/document.xml'].getElementsByTagName('pic:pic')
    pictElements = document['word/document.xml'].getElementsByTagName('w:pict')

    if len(picElements) > 0:
        return True
    if len(pictElements) > 0:
        return True
    return False
1318
# NOTE(review): the `def` header of this function is missing from this extract; the
# body reads `document`, presumably its only parameter -- confirm.
    '''Gets the image paths or the file names of the images used in the document.

    @return: The image targets as strings in a list.
    '''
    targets = []

    picElements = document['word/document.xml'].getElementsByTagName('pic:pic')
    picElements += document['word/document.xml'].getElementsByTagName('w:pict')

    for picElement in picElements:
        # The r:embed attribute of the first <a:blip> is the relationship id of the image file.
        # NOTE(review): the [0] lookup raises IndexError for an element without <a:blip>;
        # <w:pict> elements may not contain one -- confirm.
        picRId = picElement.getElementsByTagName('a:blip')[0].getAttribute('r:embed')
        targets.append(getRelsTargetByRId(picRId, document['word/_rels/document.xml.rels']))
    return targets
1333
# NOTE(review): the `def` header of this function is missing from this extract; the
# body reads `document`, presumably its only parameter -- confirm.
    '''Checks if the next paragraph after a picture paragraph uses the caption style.

    Also checks that the caption contains an automatic field (the text 'SEQ').
    Goes through all picture paragraphs.

    @return: True if all images have captions, False otherwise.
    '''
    styleXml = document['word/styles.xml']

    picParagraphs = []
    picElements = document['word/document.xml'].getElementsByTagName('pic:pic')
    picElements += document['word/document.xml'].getElementsByTagName('w:pict')

    for pic in picElements:
        picParagraphs.append(getParentParagraph(pic))

    # NOTE(review): picParagraphs is always a list here, so this branch cannot be taken.
    if picParagraphs is None:
        return False

    for p in picParagraphs:
        try:
            # The caption is expected to be the node right after the picture paragraph.
            captionParagraph = p.nextSibling
            captionParagraphPpr = captionParagraph.getElementsByTagName('w:pPr')[0]
            captionParagraphStyleID = captionParagraphPpr.getElementsByTagName('w:pStyle')[0].getAttribute('w:val')
            captionParagraphStyleName = _getStyleNameById(captionParagraphStyleID, styleXml)
            if captionParagraphStyleName != "caption":
                return False
        except:
            # Any failure (no next sibling, no <w:pPr>, no <w:pStyle>) counts as a missing caption.
            return False

        # NOTE(review): indentation is reconstructed; this check is assumed to run for
        # every picture paragraph (inside the loop) -- confirm against the original file.
        if getAttributeContent(captionParagraph).find('SEQ') == -1 and \
                getTextContent(captionParagraph).find('SEQ') == -1:
            return False

    return True
1391
1393 '''Checks that text paragraphs are using styles and that no manual style definitions are made.
1394
1395 Goes through all paragraph-elements in a document looking for <w:pStyle>-elements.
1396 Gets the style definitions to see if there are manual changes.
1397
1398 @note: Exception:
1399
1400 Automatically generated table on contents can contain "manual" style definitions.
1401 The <w:sectPr> elements within paragraph elements are skipped also.
1402
1403 @param errorIdsAndPositions: A dict for error strings. Should contain keys 'manualChanges' and 'styleNotUsed'.
1404
1405 @return: True if nothing was found, False if even one error was found.
1406 '''
1407
1408 paragraphs = document['word/document.xml'].getElementsByTagName('w:p')
1409 styleXml = document['word/styles.xml']
1410
1411 for p in paragraphs:
1412 styleDefinitions = {'w:autoRedefine': None,
1413 'w:left': None,
1414 'w:right': None,
1415 'w:firstLine': None,
1416 'w:line': None,
1417 'w:before': None,
1418 'w:after': None,
1419 'w:widowControl': None,
1420 'w:jc': None,
1421 'w:sz': None,
1422 'w:ascii': None,
1423 'w:asciiTheme': None,
1424 'w:b': None,
1425 'w:i': None,
1426 'w:u': None,
1427
1428
1429
1430
1431 'w:keepNext': None,
1432 'w:keepLines': None,
1433 'w:pageBreakBefore': None}
1434
1435 try:
1436 style = p.getElementsByTagName('w:pStyle')[0].getAttribute('w:val')
1437 style = _getStyleNameById(style, styleXml)
1438
1439
1440 if style.startswith('toc'):
1441 continue
1442
1443
1444
1445
1446
1447
1448 except:
1449
1450 pContent = _getTextFromParagraph(p)
1451 if pContent.strip() != "":
1452
1453 errorIdsAndPositions['styleNotUsed'].append(pContent[:30])
1454
1455 try:
1456 for paragraphProperties in p.getElementsByTagName('w:pPr'):
1457 for propertyElement in paragraphProperties.childNodes:
1458
1459 if propertyElement.tagName != "w:sectPr":
1460 styleDefinitions = _getStyleDefinitions(propertyElement, styleDefinitions)
1461 for runProperties in p.getElementsByTagName('w:rPr'):
1462 styleDefinitions = _getStyleDefinitions(runProperties, styleDefinitions)
1463
1464 for key in styleDefinitions.keys():
1465 if styleDefinitions[key] is not None:
1466 pContent = _getTextFromParagraph(p)
1467 if pContent.strip() != "":
1468 errorIdsAndPositions['manualChanges'].append(pContent[:50])
1469 break
1470 except:
1471 continue
1472
1473 for key in errorIdsAndPositions.keys():
1474 if len(errorIdsAndPositions[key]) > 0:
1475 return False
1476 return True
1477
1497
1499 ''' Goes through images' captions looking for a reference. Then checks if the caption is referenced somewhere.
1500
1501 @return: True if a cross reference is found, False otherwise.
1502 '''
1503
1504
1505 docXml = document['word/document.xml']
1506
1507 picParagraphs = []
1508 picElements = document['word/document.xml'].getElementsByTagName('pic:pic')
1509 picElements += document['word/document.xml'].getElementsByTagName('w:pict')
1510
1511 for pic in picElements:
1512 picParagraphs.append(getParentParagraph(pic))
1513
1514 for p in picParagraphs:
1515 captionParagraph = p.nextSibling
1516 try:
1517 bookmarkStartElement = captionParagraph.getElementsByTagName('w:bookmarkStart')[0]
1518 except:
1519
1520 return False
1521 reference = bookmarkStartElement.getAttribute('w:name')
1522
1523 for element in docXml.getElementsByTagName('w:instrText'):
1524 elementText = getTextContent(element)
1525 if elementText.find(reference) != -1:
1526 return True
1527
1528 return False
1529
1531 '''Gets an element by an attribute value.
1532
1533 @param nodeList: A list of elements to be searched for.
1534 @param attributeName: The name of the wanted attribute.
1535 @param attributeValue: The wanted value of the attribute.
1536
1537 @return: The element, if it has an attribute with the wanted value, None otherwise.
1538 '''
1539 for element in nodeList:
1540 if element.getAttribute(attributeName) == attributeValue:
1541 return element
1542 return None
1543
1545 '''Checks that a style is used in the document.
1546
1547 @param styleName: The name of the style looked for.
1548
1549 @return: True if the style is used, False otherwise.
1550 '''
1551 docXml = document['word/document.xml']
1552 styleXml = document['word/styles.xml']
1553
1554 styleId = _getStyleIdByName(styleName, styleXml)
1555 bodyParagraphs = _getParagraphElementsByStyleId(docXml, styleId)
1556
1557 if len(bodyParagraphs) > 0:
1558 return True
1559 return False
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1587 '''Checks the headings in the document.
1588
1589 Goes through the heading styles used in the document checking that they use a multilevel numbering,
1590 the numbering is done correctly using styles and that the numbering is connected to other heading styles.
1591
1592 Gets all the heading styles used in the document.
1593 Searches for the heading's numbering definition reference in styles.xml.
1594 Next searches the associated numbering definition in numbering.xml.
1595 Next searches the correct numbering level definition associated to the heading.
1596 Checks that the numbering is multilevel and done correctly using the heading styles.
1597
1598 @note: XML example:
1599
1600 styles.xml:
1601
1602 <w:style w:type="paragraph" w:styleId="Heading2"> - Heading 2 style definition
1603
1604 <w:name w:val="heading 2"/>
1605
1606 <w:pPr>
1607
1608 <w:numPr>
1609
1610 <w:ilvl w:val="1"/> - Numbering Level Reference
1611
1612 <w:numId w:val="1"/> - Numbering Definition Instance Reference
1613
1614 </w:numPr>
1615
1616 <w:outlineLvl w:val="1"/>
1617
1618 </w:pPr>
1619
1620 </w:style>
1621
1622 numbering.xml:
1623
1624 <w:abstractNum w:abstractNumId="0"> - Abstract Numbering Definition
1625
1626 <w:multiLevelType w:val="multilevel"/> - Abstract Numbering Definition Type
1627
1628 <w:lvl w:ilvl="0"> - </w:lvl> - Numbering Level Definition
1629
1630 <w:lvl w:ilvl="1"> - Numbering Level Definition
1631
1632 <w:start w:val="1"/> - Starting Value
1633
1634 <w:numFmt w:val="decimal"/> - Numbering Format
1635
1636 <w:pStyle w:val="Heading2"/> - Paragraph Style's Associated Numbering Level
1637
1638 <w:lvlText w:val="%1.%2"/> - Numbering Level Text
1639
1640 <w:lvlJc w:val="left"/> - Justification
1641
1642 <w:pPr> - Numbering Level Associated Paragraph Properties
1643
1644 <w:ind w:left="576" w:hanging="576"/>
1645
1646 </w:pPr>
1647
1648 </w:lvl>
1649
1650 </w:abstractNum>
1651
1652 <w:num w:numId="1"> - Numbering Definition Instance
1653
1654 <w:abstractNumId w:val="0"/> - Abstract Numbering Definition Reference
1655
1656 </w:num>
1657
1658 @param errorIdsAndPositions: A dict for appending errors in key - stringlist pairs.
1659 Should contain the following keys:
1660 - 'manualNumbering' -- numbering is done manually somehow.
1661 - 'styleNotUsed' -- an expected heading style is not used.
1662 - 'differentNumbering' -- some heading style is using different numbering than some other heading styles.
1663 - 'notMultilevel' -- the numbering is not multilevel.
1664 - 'outlineLvl' -- the outline of a heading style is not correct.
1665 - 'numStart' -- the numbering doesn't start at 1.
1666 - 'numWrong' -- the numbering is somehow not done with styles.
1667 - 'numFormat' -- the numbering format is not correct.
1668 - 'notSequential' -- heading styles are not used correctly in a row for example heading 3 is used after heading 1.
1669 '''
1670
1671
1672 docXml = document["word/document.xml"]
1673 styleXml = document["word/styles.xml"]
1674
1675 try:
1676 numXml = document['word/numbering.xml']
1677
1678
1679 except:
1680
1681 return False
1682
1683
1684 headingParagraphs = _getParagraphElementsBySequentialStyleName("heading", styleXml, docXml)
1685
1686
1687
1688
1689
1690 usedHeadingsStyleIds = []
1691 previousHeadingLevel = 0
1692 for heading in headingParagraphs:
1693 styleId = heading.getElementsByTagName('w:pStyle')[0].getAttribute('w:val')
1694 if len(heading.getElementsByTagName('w:ilvl')) > 0 or \
1695 len(heading.getElementsByTagName('w:numId')) > 0:
1696 errorIdsAndPositions['manualNumbering'] = getTextContent(heading)
1697
1698 if usedHeadingsStyleIds.count(styleId) == 0:
1699 usedHeadingsStyleIds.append(str(styleId))
1700 headingLevel = int(styleId[len(styleId) - 1])
1701 if fabs(headingLevel - previousHeadingLevel) > 1:
1702
1703 errorIdsAndPositions['notSequential'] = getTextContent(heading)
1704 previousHeadingLevel = headingLevel
1705
1706
1707 usedHeadingsStyleIds.sort(cmp=None, key=None, reverse=False)
1708
1709
1710 previousNumId = None
1711
1712 for headingStyleId in usedHeadingsStyleIds:
1713
1714 headingLevel = int(headingStyleId[len(usedHeadingsStyleIds[0]) - 1])
1715
1716 headingStyleElement = _getStyleElementById(headingStyleId, styleXml)
1717
1718
1719
1720 styleDefinitions = {'w:ilvl': '0', 'w:numId': None, 'w:outlineLvl': None}
1721 styleDefinitions = _getStyleDefinitions(headingStyleElement, styleDefinitions)
1722
1723
1724 if styleDefinitions['w:numId'] is None:
1725 errorIdsAndPositions['styleNotUsed'] = headingStyleId
1726
1727
1728 if previousNumId is not None and styleDefinitions['w:numId'] != previousNumId:
1729 errorIdsAndPositions['differentNumbering'] = headingStyleId
1730
1731 previousNumId = styleDefinitions['w:numId']
1732 if int(styleDefinitions['w:ilvl']) != headingLevel - 1:
1733 errorIdsAndPositions['notMultilevel'] = headingStyleId
1734
1735 if int(styleDefinitions['w:outlineLvl']) != headingLevel - 1:
1736 errorIdsAndPositions['outlineLvl'] = headingStyleId
1737
1738
1739
1740
1741
1742
1743 try:
1744 numElement = _getElementByAttributeValue(numXml.getElementsByTagName('w:num'), 'w:numId', styleDefinitions['w:numId'])
1745 abstractNumId = numElement.getElementsByTagName('w:abstractNumId')[0].getAttribute('w:val')
1746 absNumElement = _getElementByAttributeValue(numXml.getElementsByTagName('w:abstractNum'), 'w:abstractNumId', abstractNumId)
1747 lvlElement = _getElementByAttributeValue(absNumElement.getElementsByTagName('w:lvl'), 'w:ilvl', styleDefinitions['w:ilvl'])
1748 except:
1749
1750 continue
1751
1752
1753 numDefinitions = {'w:start': None, 'w:numFmt': None, 'w:pStyle': None, 'w:lvlText': None, 'w:lvlJc': None, 'w:tentative': None}
1754 numDefinitions = _getStyleDefinitions(lvlElement, numDefinitions)
1755
1756
1757 if numDefinitions['w:start'] != '1':
1758 errorIdsAndPositions['numStart'] = headingStyleId
1759
1760 if numDefinitions['w:pStyle'] != headingStyleId:
1761 errorIdsAndPositions['numWrong'] = headingStyleId
1762
1763 if numDefinitions['w:numFmt'] != "decimal":
1764 errorIdsAndPositions['numFormat'] = headingStyleId
1765
1766
1767 return True
1768
1770 ''' Gets all paragraph-elements in the document by a style id.'''
1771 paragraphList = []
1772
1773 for p in docXml.getElementsByTagName('w:p'):
1774 try:
1775 if styleId == p.getElementsByTagName('w:pStyle')[0].getAttribute('w:val'):
1776 paragraphList.append(p)
1777 except:
1778 continue
1779
1780 return paragraphList
1781
def _getParagraphElementsBySequentialStyleName(styleNamePrefix, styleXml, docXml):
    '''Returns all paragraph elements that use a style name with a sequential numbering.

    Gets all paragraphs that use styles with style names for example heading 1, heading 2, etc. or
    index 1, index 2, etc.

    The numbering is walked upwards from 1 and the search stops at the first number that has no
    style in styles.xml, so styles after a gap in the numbering are not looked at.
    The result is grouped by the style number, not by the position in the document.

    @param styleNamePrefix: The prefix of the sequential style name. Surrounding whitespace is ignored.
    @param styleXml: styles.xml as a DOM tree.
    @param docXml: document.xml as a DOM tree.

    @return: The <w:p>-elements using the styles as a list, an empty list if there are none.
    '''
    # Style names have exactly one space between the prefix and the number.
    namePrefix = styleNamePrefix.strip() + " "
    paragraphs = []
    number = 1

    while True:
        styleId = _getStyleIdByName(namePrefix + str(number), styleXml)
        if styleId is None:
            break
        paragraphs.extend(_getParagraphElementsByStyleId(docXml, styleId))
        number += 1
    return paragraphs
1802
1804 '''Checks that the document has an automatically made index.
1805
1806 @return: False if an index is missing, '2' if index is not automatically made and True if everything was OK.
1807 '''
1808 docXml = document['word/document.xml']
1809 styleXml = document['word/styles.xml']
1810
1811 indexParagraphs = _getParagraphElementsBySequentialStyleName("index ", styleXml, docXml)
1812 if len(indexParagraphs) == 0:
1813 return False
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833 try:
1834 indexFieldCodeElement = indexParagraphs[0].previousSibling.getElementsByTagName('w:instrText')[0]
1835 except:
1836 indexFieldCodeElement = None
1837
1838 if indexFieldCodeElement is None:
1839 try:
1840 indexFieldCodeElement = indexParagraphs[0].previousSibling.previousSibling.getElementsByTagName('w:instrText')[0]
1841 except:
1842 indexFieldCodeElement = None
1843
1844 if indexFieldCodeElement is None:
1845
1846 return '2'
1847 elif getTextContent(indexFieldCodeElement).find('INDEX') == -1:
1848
1849 return '2'
1850
1851 return True
1852
def checkIndexContent(document):
    r'''Checks that the document has an index that is not empty, and that the index entries are referenced somewhere in the document.

    First finds the paragraphs using the index styles (index 1, index 2, ...) in the document.xml.
    Collects the text content of the index paragraphs and checks it isn't empty.
    Collects the text of all the field codes (<w:instrText>-elements), splits it to index reference
    components and matches them to the index content.

    @note: XML example:

    Index example:

    <w:p w:rsidR="002F2A09" w:rsidRDefault="00CA51D5">
        <w:r>
            <w:fldChar w:fldCharType="begin"/>
        </w:r>
        <w:r>
            <w:instrText xml:space="preserve"> INDEX \c "2" \z "1035" </w:instrText>
        </w:r>
        <w:r>
            <w:fldChar w:fldCharType="separate"/>
        </w:r>
    </w:p>
    <w:p w:rsidR="002F2A09" w:rsidRDefault="002F2A09">
        <w:pPr>
            <w:pStyle w:val="Index1"/>
            <w:tabs>
                <w:tab w:val="right" w:leader="dot" w:pos="3950"/>
            </w:tabs>
        </w:pPr>
        <w:r>
            <w:t>Index entry level 1</w:t>
        </w:r>
    </w:p>

    Reference example:

    <w:r w:rsidR="00B27B47">
        <w:instrText xml:space="preserve"> XE "</w:instrText>
    </w:r>
    <w:r w:rsidR="00B27B47" w:rsidRPr="00B27B47">
        <w:instrText>Level 1 entry</w:instrText>
    </w:r>
    <w:r w:rsidR="00B27B47" w:rsidRPr="00B27B47">
        <w:instrText>:</w:instrText>
    </w:r>
    <w:r w:rsidR="00B27B47" w:rsidRPr="0011587C">
        <w:instrText>Level 2 entry</w:instrText>
    </w:r>

    @param document: The document as a dict of DOM trees, read with the keys
    'word/document.xml' and 'word/styles.xml'.

    @return: '3' if the index is empty, '4' if the content does not match with the document and True if everything went OK.
    '''
    docXml = document['word/document.xml']
    styleXml = document['word/styles.xml']

    # Maps the text of every non-empty index paragraph to whether a reference to it has been found.
    indexParagraphs = _getParagraphElementsBySequentialStyleName("index ", styleXml, docXml)
    indexEntryReferenced = {}
    for p in indexParagraphs:
        textContent = getTextContent(p)
        if textContent is not None and textContent != "":
            indexEntryReferenced[textContent] = False
    if len(indexEntryReferenced) == 0:
        return '3'

    # A single XE field can be split to several <w:instrText>-elements (see the reference example),
    # so all the field code texts are joined together before they are taken apart.
    documentFieldTexts = "".join([getTextContent(fieldElement)
                                  for fieldElement in docXml.getElementsByTagName('w:instrText')])

    # NOTE(review): the pieces of the other field codes around the XE fields (for example the
    # INDEX field itself) end up in the components too, which makes the matching rather loose.
    indexReferenceFieldsContent = []
    for field in documentFieldTexts.split('XE '):
        for candidate in field.split('"'):
            for finalCandidate in candidate.split(":"):
                if finalCandidate.strip() != "":
                    indexReferenceFieldsContent.append(finalCandidate)

    # Every index entry that matches a component is marked. Stopping at the first matching entry
    # would leave other matching entries unmarked depending on the iteration order of the dict.
    for indexReferenceComponent in indexReferenceFieldsContent:
        for entry in indexEntryReferenced:
            if indexReferenceComponent in entry or entry in indexReferenceComponent:
                indexEntryReferenced[entry] = True

    for entry in indexEntryReferenced:
        if not indexEntryReferenced[entry]:
            return '4'
    return True
1986
1988 '''Checks double whitespaces in the document.
1989
1990 @return: The amount of occurrences of the double whitespaces found in the document, False otherwise.
1991 '''
1992 return checkStringFromDocument(document['word/document.xml'], ' ')
1993
1995 '''Checks the *-character in the document.
1996
1997 @return: The amount of occurrences of the asterisks found in the document, False otherwise.
1998 '''
1999 return checkStringFromDocument(document['word/document.xml'], '*')
2000
def checkStringFromDocument(docXml, string):
    '''Checks if a string is found in the text content of the document (in the w:t-elements).

    The texts of all the w:t-elements inside a paragraph are joined together before counting,
    so an occurrence can span several runs of a paragraph but not several paragraphs.

    @param docXml: document.xml as a DOM tree.
    @param string: The string that is looked for.

    @return: The total amount of occurrences of the string found in the document, False if there were none.
    '''
    count = 0
    for p in docXml.getElementsByTagName('w:p'):
        textContent = "".join([getTextContent(textElement)
                               for textElement in p.getElementsByTagName('w:t')])
        count += textContent.count(string)

    # The callers expect False, not 0, when nothing was found.
    if count > 0:
        return count
    return False
2021
2022
2023
2024
2026 '''Checks if the tabulator is used in the document.
2027
2028 @note: Exceptions:
2029
2030 - automatically generated table of contents and index contain tabulators.
2031
2032 - before an automatically generated index there is a paragraph-element with <instrText>-element and a <tab>-element.
2033
2034 @return: The amount of the tabulator occurrences found in the document, False if none was found.
2035 '''
2036
2037 styleXml = document['word/styles.xml']
2038 tabParagraphs = document['word/document.xml'].getElementsByTagName('w:tab')
2039 tabCount = 0
2040
2041 if len(tabParagraphs) == 0:
2042 return False
2043
2044
2045
2046
2047 for tab in tabParagraphs:
2048 tabParent = getParentParagraph(tab, 'w:p')
2049 try:
2050 tabParentStyleId = tabParent.getElementsByTagName('w:pStyle')[0].getAttribute('w:val')
2051 except:
2052 continue
2053 if _getStyleNameById(tabParentStyleId, styleXml).startswith('toc') or \
2054 _getStyleNameById(tabParentStyleId, styleXml).startswith('index'):
2055 continue
2056 else:
2057 if getTextContent(tabParent).find('INDEX') != -1:
2058 continue
2059 else:
2060
2061 tabCount += 1
2062
2063
2064
2065
2066
2067
2068
2069
2070 if tabCount == 0:
2071 return False
2072
2073 return tabCount
2074
def _siblingStyleNameContains(node, styleXml, fragments):
    '''Tells if the style name of a sibling node contains any of the given strings.'''
    # Nodes without a tag name (for example text nodes) cannot use a paragraph style.
    if node is None or getattr(node, 'tagName', None) is None:
        return False
    styleId = _getParagraphStyleId(node)
    if styleId is None:
        return False
    styleName = _getStyleNameById(styleId, styleXml)
    if styleName is None:
        return False
    for fragment in fragments:
        if styleName.find(fragment) != -1:
            return True
    return False


def isParagraphEmpty(p, styleXml):
    '''Checks if a paragraph is empty.

    A paragraph without any text is still not counted as empty in the following cases:

    - it contains a picture, a section definition, an object or a graphic
      (<pic:pic>, <w:sectPr>, <w:pict>, <w:object> or <a:graphic>),
    - it is inside a table (an empty table cell produces an empty paragraph),
    - it comes right after a table (a table produces an empty paragraph right after the table),
    - the previous sibling uses a style with 'toc' or 'index' in the style name,
    - the sibling before the previous sibling uses a style with 'index' in the style name.

    @param p: The paragraph element under inspection.
    @param styleXml: styles.xml as a DOM tree.

    @return: False if the paragraph is not empty or one of the exceptions applies, True if it is empty.
    '''
    if len(_getTextFromParagraph(p).strip()) > 0:
        return False

    for contentTag in ('pic:pic', 'w:sectPr', 'w:pict', 'w:object', 'a:graphic'):
        if len(p.getElementsByTagName(contentTag)) > 0:
            return False

    if getParentParagraph(p, 'w:tbl') is not None:
        return False

    previous = p.previousSibling
    if previous is None:
        return True

    # The previous sibling is not necessarily an element, so the tag name is read safely.
    if getattr(previous, 'tagName', None) == 'w:tbl':
        return False
    if _siblingStyleNameContains(previous, styleXml, ('toc', 'index')):
        return False
    if _siblingStyleNameContains(previous.previousSibling, styleXml, ('index',)):
        return False
    return True
2136
2138 ''' Finds all empty paragraphs in the document.
2139
2140 @note: Expections:
2141
2142 Picture in the document produces an empty paragraph.
2143 Empty table cell produces an empty paragraph.
2144 A table produces an empty paragraph right after the table.
2145 ...?
2146
2147 @return: amount of empty paragraph occurrences in the document, False if none was found.
2148 '''
2149 paragraphs = document['word/document.xml'].getElementsByTagName('w:p')
2150
2151 emptyParagraphsCount = 0
2152
2153 for p in paragraphs:
2154 result = isParagraphEmpty(p, document['word/styles.xml'])
2155
2156 if result is True:
2157 emptyParagraphsCount += 1
2158
2159 if emptyParagraphsCount == 0:
2160 return False
2161 else:
2162 return emptyParagraphsCount
2163
2165 ''' Goes through all paragraph elements in the document looking for paragraphs that use some list style.
2166
2167 @param listName: The list stylename we want to check. Defaults to 'List',
2168 which finds list styles such as 'List', 'List Bullet', 'List Numbered'.
2169
2170 @return: True, if a list style is used in the document, False otherwise.
2171 '''
2172 docXml = document['word/document.xml']
2173 styleXml = document['word/styles.xml']
2174
2175 for p in docXml.getElementsByTagName('w:p'):
2176 styleId = _getParagraphStyleId(p)
2177 if styleId is not None:
2178 styleName = _getStyleNameById(styleId, styleXml)
2179 if styleName.find(listName) != -1:
2180 return True
2181
2182 return False
2183
2185 '''Checks that the document has a chart copied from a spreadsheet document.
2186 The Chart must be pasted as a link.
2187 '''
2188
2189
2190 docXml = document['word/document.xml']
2191 docRelsXml = document['word/_rels/document.xml.rels']
2192
2193 objectElements = docXml.getElementsByTagName('w:object')
2194 if len(objectElements) == 0:
2195 return False
2196
2197 for objectElement in objectElements:
2198 if len(objectElement.getElementsByTagName('v:formulas')) > 0:
2199
2200 try:
2201 OLEObjectElement = objectElement.getElementsByTagName('o:OLEObject')[0]
2202 except:
2203 continue
2204
2205 if OLEObjectElement.getAttribute('ProgID').find('Excel') != -1:
2206
2207
2208 if OLEObjectElement.getAttribute('Type') == 'Link':
2209
2210 rid = OLEObjectElement.getAttribute('r:id')
2211 target = getRelsTargetByRId(rid, docRelsXml)
2212 targetChart = target.split("!")
2213 targetChart.reverse()
2214
2215 targetChartName = targetChart[0]
2216
2217
2218 if targetChartName.count('%') < 3:
2219 return False
2220
2221
2222
2223
2224 return True
2225 else:
2226 return "Spreadsheet object is not pasted as a link."
2227 return False
2228
2230 '''Checks that the document has a table copied from a spreadsheet document.
2231 For now checks that the table is pasted as a link.
2232 '''
2233 docXml = document['word/document.xml']
2234 docRelsXml = document['word/_rels/document.xml.rels']
2235
2236 objectElements = docXml.getElementsByTagName('w:object')
2237 if len(objectElements) == 0:
2238 return False
2239
2240 for objectElement in objectElements:
2241 if len(objectElement.getElementsByTagName('v:formulas')) == 0:
2242
2243 try:
2244 OLEObjectElement = objectElement.getElementsByTagName('o:OLEObject')[0]
2245 except:
2246 continue
2247
2248 if OLEObjectElement.getAttribute('ProgID').find('Excel') != -1:
2249
2250
2251 if OLEObjectElement.getAttribute('Type') == 'Link':
2252 rid = OLEObjectElement.getAttribute('r:id')
2253 target = getRelsTargetByRId(rid, docRelsXml)
2254 targetChart = target.split("!")
2255 targetChart.reverse()
2256 targetTableCells = targetChart[0]
2257
2258
2259
2260 if targetTableCells.count('R') != 2:
2261 return False
2262 if targetTableCells.count('C') != 2:
2263 return False
2264
2265
2266 return True
2267 else:
2268 return "Spreadsheet object is not pasted as a link."
2269 return False
2270
2272 '''Checks that the document contains a chart pasted from PowerPoint as a vector graphics picture or as an object.
2273 Doesn't really know if the picture or object is actually from PowerPoint!
2274 '''
2275
2276
2277
2278 docRelsXml = document['word/_rels/document.xml.rels']
2279
2280 pictureTargets = getImagePaths(docRelsXml)
2281 for target in pictureTargets:
2282 if target.endswith('.emf') is True:
2283
2284 return True
2285 if target.endswith('wmf') is True:
2286 return True
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296 return False
2297