Package src :: Package inspectors :: Module docx_inspector
[hide private]
[frames] | no frames]

Source Code for Module src.inspectors.docx_inspector

   1  #!/usr/bin/python 
   2  # -*- coding: UTF-8 -*- 
   3  # 
   4  #The MIT License 
   5  # 
   6  #Copyright (c) 2011 
   7  # 
   8  #Permission is hereby granted, free of charge, to any person obtaining a copy 
   9  #of this software and associated documentation files (the "Software"), to deal 
  10  #in the Software without restriction, including without limitation the rights 
  11  #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
  12  #copies of the Software, and to permit persons to whom the Software is 
  13  #furnished to do so, subject to the following conditions: 
  14  # 
  15  #The above copyright notice and this permission notice shall be included in 
  16  #all copies or substantial portions of the Software. 
  17  # 
  18  #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
  19  #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
  20  #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
  21  #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
  22  #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
  23  #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
  24  #THE SOFTWARE. 
  25  # 
  26  #Authors: 
  27  #   Vili Auvinen (vili.k.auvinen@jyu.fi) 
  28  #   Olli Kauppinen (olli.kauppinen@jyu.fi) 
  29  #   Juho Tammela (juho.i.tammela@jyu.fi) 
  30   
  31  ''' 
  32  The module provides the methods for inspecting docx files. 
  33   
  34  @author: Vili Auvinen, Juho Tammela 
  35  ''' 
  36   
  37  from common_methods import * 
  38  from math import fabs 
  39  from conversions import convertTwipToCm, convertTwipToPt 
  40   
def _getStyleElementById(styleId, styleXml):
    '''Finds the w:style element that carries the given style id.

    A paragraph in document.xml names its style through the style id, and the id is what links
    the paragraph to the matching style in styles.xml.
    The style id can be in different languages depending on the language of the Word that wrote the document.

    @note: XML example:

        <w:p>
            <w:pPr>
                <w:pStyle w:val="Otsikko1"/>    (This is the style id.)
            </w:pPr>
            <r>
                ...
            </r>
        </w:p>

    @param styleId: The style id.
    @param styleXml: styles.xml as a DOM tree.

    @return: The first w:style element whose w:styleId equals the given id, or None if there is no such element.
    '''
    matchingStyles = [candidate
                      for candidate in styleXml.getElementsByTagName('w:style')
                      if candidate.getAttribute('w:styleId') == styleId]
    if matchingStyles:
        return matchingStyles[0]
    return None
73
def _getStyleElementByName(styleName, styleXml):
    '''Finds the style element that has the given style name in styles.xml.

    The style name is stored in styles.xml and stays the same regardless of the language
    of the Word that wrote the document (unlike the style id).

    @note: XML example:

        <w:style w:type="paragraph" w:styleId="Otsikko1">    (This is the style id.)
            <w:name w:val="heading 1"/>                      (Here is the style name.)
            ...
        </w:style>

    @param styleName: The style name. Compared exactly (case-sensitive) to the w:val of each w:name element.
    @param styleXml: styles.xml as a DOM tree.

    @return: The parent node of the first matching w:name element, or None if no name matched.
    '''
    for nameElement in styleXml.getElementsByTagName('w:name'):
        if nameElement.getAttribute('w:val') != styleName:
            continue
        # w:name is a child of the style element it names, so the parent is the style.
        return nameElement.parentNode
    return None
98
def _getBasedOnStyleId(styleName, styleXml):
    '''Gets the style id of the style that the given style is based on.

    Note that the style is looked up by its NAME but the value returned is the style ID
    found in the w:val attribute of the w:basedOn element.

    @param styleName: The style name of the style whose based-on style id is wanted.
    @param styleXml: styles.xml as a DOM tree.

    @return: The id of the based-on style, or None if the style was not found or it has no w:basedOn element.
    '''
    # Look the style element up once instead of once for the test and once for the use.
    styleElement = _getStyleElementByName(styleName, styleXml)
    if styleElement is None:
        return None

    # A missing w:basedOn element is the only expected failure, so test for it
    # explicitly rather than hiding every possible error behind a bare except.
    basedOnElements = styleElement.getElementsByTagName('w:basedOn')
    if len(basedOnElements) == 0:
        return None
    return basedOnElements[0].getAttribute('w:val')
113
def _getStyleName(styleElement):
    '''Gets the style name of a style element.

    The name is read from the w:val attribute of the first w:name element inside the style element.

    @param styleElement: The style element whose style name is wanted.

    @return: The style name of the given style element, or None if the element contains no w:name element.
    '''
    nameElements = styleElement.getElementsByTagName('w:name')
    # A style without a w:name used to raise IndexError here; report "no name" instead.
    if len(nameElements) == 0:
        return None
    return nameElements[0].getAttribute('w:val')
122
def _getStyleNameById(styleId, styleXml):
    '''Resolves a style id to the name of the style.

    @param styleId: The style id to be looked for.
    @param styleXml: styles.xml as a DOM tree.

    @return: The style name of the style with the given style id, or None if no style has that id.
    '''
    matchingStyle = _getStyleElementById(styleId, styleXml)
    if matchingStyle is None:
        return None
    return _getStyleName(matchingStyle)
135
def _getStyleIdByName(styleName, styleXml):
    '''Resolves a style name to the id of the style in styles.xml.

    The names are compared in lower case, so the match is case-insensitive.

    @param styleName: The style name to be looked for.
    @param styleXml: styles.xml as a DOM tree.

    @return: The w:styleId of the parent of the first matching w:name element, or None if no name matched.
    '''
    for nameElement in styleXml.getElementsByTagName('w:name'):
        candidateName = nameElement.getAttribute('w:val')
        if candidateName.lower() != styleName.lower():
            continue
        owningStyle = nameElement.parentNode
        return owningStyle.getAttribute('w:styleId')
    return None
151
def _getParagraphStyleId(p):
    '''Gets the style id of a paragraph element.

    The id is the w:val attribute of the first w:pStyle element inside the paragraph.

    @param p: The paragraph element.

    @return: The style id if it was found, otherwise returns None.
    '''
    try:
        return p.getElementsByTagName('w:pStyle')[0].getAttribute('w:val')
    except (IndexError, AttributeError):
        # IndexError: the paragraph has no w:pStyle element.
        # AttributeError: p is None or a node type that cannot be searched (e.g. a text node).
        # Anything else is a real error and is no longer silently swallowed.
        return None
164 165
def _getThemeFont(themeXml, styleDefinitions, themeFont):
    '''Replaces a theme font reference with the actual typeface from theme1.xml.

    @note: XML example:

        <a:fontScheme name="Office">
            <a:majorFont>
                <a:latin typeface="Cambria"/>
                <a:ea typeface=""/>
                <a:cs typeface=""/>
            </a:majorFont>
            <a:minorFont>
                <a:latin typeface="Calibri"/>
                <a:ea typeface=""/>
                <a:cs typeface=""/>
            </a:minorFont>
        </a:fontScheme>

    @param themeXml: theme1.xml as DOM tree.
    @param styleDefinitions: The style definitions dict. It is modified in place.
    @see: _getCompleteStyleDefinitions.
    @param themeFont: The theme font reference. Only values starting with 'major' or 'minor' are resolved.

    @return: The style definitions with or without changes. When a non-blank latin typeface is found,
    'w:ascii' is set to it and 'w:asciiTheme' is reset to None; otherwise the dict is returned untouched.
    '''
    if themeFont == "" or themeFont is None:
        return styleDefinitions

    if themeFont.startswith('major'):
        schemeTagName = 'a:majorFont'
    elif themeFont.startswith('minor'):
        schemeTagName = 'a:minorFont'
    else:
        return styleDefinitions

    # A theme file without the font scheme or without a latin typeface used to
    # raise IndexError; treat it the same as "nothing to resolve".
    schemeElements = themeXml.getElementsByTagName(schemeTagName)
    if len(schemeElements) == 0:
        return styleDefinitions
    latinElements = schemeElements[0].getElementsByTagName('a:latin')
    if len(latinElements) == 0:
        return styleDefinitions

    typeface = latinElements[0].getAttribute('typeface')
    if typeface.strip() != "":
        styleDefinitions['w:ascii'] = typeface
        styleDefinitions['w:asciiTheme'] = None

    return styleDefinitions
209
def _getStyleDefinitions(element, styleDefinitions):
    '''Returns the style definitions of a given element.

    First recurses into the child elements of the element, then looks at the element's own attributes:
     - If the attribute name is a key in the dict, stores the value of the attribute.
     - If the attribute name is 'w:val' and the element tag name is a key in the dict, stores the value of the attribute.
    If the element tag name is a key in the dict and the element doesn't have any attributes or children,
    stores the value True in the dict.

    Because the element's own attributes are handled after its children, and a later element
    overwrites an earlier one, the last definition in document order wins.

    @param element: Style definitions are searched inside this element.
    @param styleDefinitions: A dict where the style definitions are stored.
    May contain tag names or attribute names and some default values. It is modified in place.

    @return: The style definitions in a dict (the same dict object that was given).
    '''

    #    <w:pPr>
    #      <w:pBdr>
    #        <w:top w:val="single" w:sz="24" w:space="0" w:color="4F81BD"/>
    #        <w:left w:val="single" w:sz="24" w:space="0" w:color="4F81BD"/>
    #        <w:bottom w:val="single" w:sz="24" w:space="0" w:color="4F81BD"/>
    #        <w:right w:val="single" w:sz="24" w:space="0" w:color="4F81BD"/>
    #      </w:pBdr>
    #      <w:shd w:val="clear" w:color="auto" w:fill="4F81BD"/>
    #      <w:spacing w:before="360" w:after="0"/>
    #      <w:outlineLvl w:val="0"/>
    #    </w:pPr>
    # We are not interested in border style definitions as above at the moment. Implement here if needed later.
    # They must be skipped and not just ignored: a border's <w:left w:val="single" w:sz="24"/> would
    # otherwise be stored as the left indentation ('single') and as the font size ('24').
    # Table borders, cell borders and the run border (w:bdr) have the same shape and the same problem.
    if element.tagName in ('w:pBdr', 'w:tblBorders', 'w:tcBorders', 'w:bdr'):
        return styleDefinitions

    for child in element.childNodes:
        # Only elements carry definitions; comments and processing instructions have no tagName.
        if child.nodeType == child.ELEMENT_NODE:
            styleDefinitions = _getStyleDefinitions(child, styleDefinitions)

    tagName = element.tagName
    if element.hasAttributes():
        attributes = element.attributes
        for i in range(attributes.length):
            attribute = attributes.item(i)
            if attribute.name in styleDefinitions:
                styleDefinitions[attribute.name] = attribute.value
            elif attribute.name == "w:val" and tagName in styleDefinitions:
                styleDefinitions[tagName] = attribute.value
    elif tagName in styleDefinitions and not element.hasChildNodes():
        # An empty toggle element such as <w:b/> or <w:keepNext/> means "on".
        styleDefinitions[tagName] = True

    return styleDefinitions
258 259
def getStyle(document, requirementStyleName):
    '''Collects all definitions of one style and returns them under translated keys.

    The style is read from word/styles.xml and theme fonts from word/theme/theme1.xml.
    Before the keys are translated, some raw values are converted:
     - w:line is divided by 240 (single line spacing = 12 points = 240 twips),
     - w:sz is halved and rounded to one decimal,
     - w:before and w:after are passed through convertTwipToPt,
     - w:left is passed through convertTwipToCm and rounded to one decimal.
    The other values are returned as they were collected.

    @param document: The document as a dict of DOM trees keyed by the path of the file inside the docx.
    @param requirementStyleName: The style name as given in the requirements.

    @return: A dict with all the style definitions of the one style with the
    translated keys to match return value odt_inspector's getStyle().
    False, if _isStyleUsed reports False for the style or no style definitions were found.
    '''
    styleXml = document['word/styles.xml']
    themeXml = document['word/theme/theme1.xml']

    if _isStyleUsed(document, requirementStyleName) is False:
        return False

    # The requirement gives the style name capitalized or in CamelCase (for example: Normal, Body Text),
    # but heading style names are in lower case in the XML (heading 1), and so are names such as
    # toc 1, index 1, footer, header and caption.
    # To get around this, the style id is fetched by comparing the names in lower case and the
    # exact style name is then fetched by the style id.
    exactStyleName = _getStyleNameById(_getStyleIdByName(requirementStyleName, styleXml), styleXml)

    styleDefinitions = _getCompleteStyleDefinitions(styleXml, exactStyleName, themeXml)
    if styleDefinitions is None:
        return False

    # In case the style name was changed above, return the same name that was given in the parameter.
    styleDefinitions['w:name'] = requirementStyleName

    # Line spacing single (1) = 12 points = 240 twips.
    #    <w:spacing w:before="300" w:after="340" w:line="240" w:lineRule="auto"/>
    # For example: w:line="360" -> line spacing is 1.5
    styleDefinitions['w:line'] = float(styleDefinitions['w:line']) / float(240.0)

    # The result can now be a value such as 13.5.
    styleDefinitions['w:sz'] = round(float(styleDefinitions['w:sz']) / 2, 1)

    for spacingKey in ('w:before', 'w:after'):
        styleDefinitions[spacingKey] = convertTwipToPt(float(styleDefinitions[spacingKey]))

    #FIXME:
    #  File "/var/www/virtual.hosts/sovellusprojektit.it.jyu.fi/parsi/sovellus/docx_inspector.py", line 244, in getStyle
    #    styleDefinitions['w:left'] = round( convertTwipToCm( float(styleDefinitions['w:left']) ), 1)
    #  ValueError: invalid literal for float(): single
    # Rounded with a precision of one decimal. If this is used more often, update to conversions.py.
    leftIndentInCm = convertTwipToCm(float(styleDefinitions['w:left']))
    styleDefinitions['w:left'] = round(leftIndentInCm, 1)

    translations = (('w:name', 'styleName'),
                    ('w:ascii', 'fontName'),
                    ('w:sz', 'fontSize'),
                    ('w:caps', 'transform'),
                    ('w:left', 'indentLeft'),
                    ('w:right', 'indentRight'),
                    ('w:firstLine', 'indentFirstLine'),
                    ('w:line', 'linespacing'),
                    ('w:before', 'spacingBefore'),
                    ('w:after', 'spacingAfter'),
                    ('w:keepNext', 'keepWithNext'),
                    ('w:jc', 'alignment'),
                    ('w:widowControl', 'widowControl'),
                    #('w:widowControl', 'widows'), #TODO: widows is hard-coded
                    ('w:b', 'bold'),
                    ('w:i', 'italic'))

    return dict((translatedKey, styleDefinitions[xmlKey]) for xmlKey, translatedKey in translations)
328
def _getCompleteStyleDefinitions(styleXml, styleName, themeXml):
    ''' Returns the style definition of the given style from style.xml and theme1.xml.
    Recursion is used because the style can be based on some other style.
    In addition, the base style gets style definitions from the document defaults.
    Finally, some style definitions are not found in the XML file at all.
    These definitions use some default value which must be assumed.

    The definitions are layered so that the later layer overrides the earlier one:
    assumed defaults, w:docDefaults, the based-on chain starting from its root, the style itself.

    @note: XML example:

        <w:style w:type="paragraph" w:default="1" w:styleId="Normaali">
            <w:name w:val="Normal"/>
            <w:qFormat/>
            <w:rsid w:val="006B493C"/>
            <w:pPr>
                <w:spacing w:before="140" w:after="220" w:line="360" w:lineRule="auto"/>
                <w:ind w:left="567"/>
                <w:jc w:val="both"/>
            </w:pPr>
            <w:rPr>
                <w:rFonts w:ascii="Georgia" w:hAnsi="Georgia"/>
                <w:lang w:val="fi-FI"/>
            </w:rPr>
        </w:style>

    @param styleXml: styles.xml-file as a DOM tree.
    @param styleName: The name of the style (NOT the id)
    @see: _getStyleElementById and _getStyleElementByName for difference.
    @param themeXml: theme1.xml as DOM tree.

    @return: Style definitions in a dict, or None if no style with the given name was found.
    '''

    # If fontSize is not found mentioned in xml, the user has used the default size which is 12 (*2 = 24) for text body
    # TODO: initialize dict with the default style definitions.
    # TODO: some of the values are in twips, convert to cm.

    # Complete style-element specification can be found at: http://www.schemacentral.com/sc/ooxml/e-w_style-1.html
    # Style-element content for the most part:
    #    w:name [0..1]    Primary Style Name
    #    w:aliases [0..1]    Alternate Style Names
    #    w:basedOn [0..1]    Parent Style ID
    #    w:next [0..1]    Style For Next Paragraph
    #    w:link [0..1]    Linked Style Reference
    #    w:autoRedefine [0..1]    Automatically Merge User Formatting Into Style Definition
    #    w:hidden [0..1]    Hide Style From User Interface
    #    w:semiHidden [0..1]    Hide Style From Main User Interface
    #    w:unhideWhenUsed [0..1]    Remove Semi-Hidden Property When Style Is Used
    #    w:qFormat [0..1]    Primary Style
    #    w:locked [0..1]    Style Cannot Be Applied
    #    w:pPr: - paragraph properties
    #        w:pStyle [0..1]    Referenced Paragraph Style
    #        w:keepNext [0..1]    Keep Paragraph With Next Paragraph
    #        w:keepLines [0..1]    Keep All Lines On One Page
    #        w:pageBreakBefore [0..1]    Start Paragraph on Next Page
    #        w:widowControl [0..1]    Allow First/Last Line to Display on a Separate Page
    #        w:numPr [0..1]    Numbering Definition Instance Reference
    #            w:numPr/w:numId [0..1]    Numbering Definition Instance Reference
    #        w:spacing [0..1]    Spacing Between Lines and Above/Below Paragraph
    #            w:before [0..1]    Spacing Above Paragraph
    #            w:beforeLines [0..1]    Spacing Above Paragraph IN Line Units
    #            w:beforeAutospacing [0..1]    Automatically Determine Spacing Above Paragraph
    #            w:after [0..1]    Spacing Below Paragraph
    #            w:afterLines [0..1]    Spacing Below Paragraph in Line Units
    #            w:afterAutospacing [0..1]    Automatically Determine Spacing Below Paragraph
    #            w:line [0..1]    Spacing Between Lines in Paragraph
    #            w:lineRule [0..1]    Type of Spacing Between Lines
    #        w:ind [0..1]    Paragraph Indentation
    #            w:left [0..1]    Left Indentation
    #            w:leftChars [0..1]    Left Indentation in Character Units
    #            w:right [0..1]    Right Indentation
    #            w:rightChars [0..1]    Right Indentation in Character Units
    #            w:hanging [0..1]    Indentation Removed from First Line
    #            w:hangingChars [0..1]    Indentation Removed From First Line in Character Units
    #            w:firstLine [0..1]    Additional First Line Indentation
    #            w:firstLineChars [0..1]    Additional First Line Indentation in Character Units
    #        w:jc [0..1]    Paragraph Alignment
    #        w:outlineLvl [0..1]    Associated Outline Level
    #    w:rPr - run properties
    #        2. w:rFonts [0..1]    Run Fonts
    #            w:ascii [0..1]    w:ST_String    ASCII Font
    #            w:hAnsi [0..1]    w:ST_String    High ANSI Font
    #            w:cs [0..1]    w:ST_String    Complex Script Font
    #            w:asciiTheme [0..1]    w:ST_Theme    ASCII Theme Font
    #            w:hAnsiTheme [0..1]    w:ST_Theme    High ANSI Theme Font
    #            w:cstheme [0..1]    w:ST_Theme    Complex Script Theme Font
    #        3. w:b [0..1]    Bold
    #        4. w:bCs [0..1]    Complex Script Bold
    #        5. w:i [0..1]    Italics
    #        6. w:iCs [0..1]    Complex Script Italics
    #        7. w:caps [0..1]    Display All Characters As Capital Letters
    #        8. w:smallCaps [0..1]    Small Caps
    #        9. w:strike [0..1]    Single Strikethrough
    #        10. w:dstrike [0..1]    Double Strikethrough
    #        11. w:outline [0..1]    Display Character Outline
    #        15. w:noProof [0..1]    Do Not Check Spelling or Grammar
    #        24. w:sz [0..1]    Font Size
    #        25. w:szCs [0..1]    Complex Script Font Size
    #        27. w:u [0..1]    Underline
    #        36. w:lang [0..1]    Languages for Run Content

    # Initialize the styleDefinitions dict with the style definitions you wish to search and their default values.
    # Rules of initialization:
    # 1. Use attribute name as a key.
    # 2. If attribute name is 'w:val', use element tag name as a key.
    # 3. If the element can be empty and attributes are optional, use element tag name as a key.
    styleDefinitions = {'w:name': None,
                        'w:basedOn': None,
                        'w:next': None,
                        'w:link': None,
                        'w:autoRedefine': None,
                        'w:left': '0',
                        'w:right': '0',
                        'w:firstLine': None,
                        'w:line': '240.0',
                        'w:before': '0',
                        'w:after': '0',
                        'w:widowControl': True, # on as default
                        'w:jc': 'left', # left as default
                        'w:sz': '24',
                        'w:ascii': None,
                        'w:asciiTheme': None,
                        'w:b': False, # default
                        'w:i': False, # default
                        'w:u': False, # default
                        #'w:outline': None,
                        #'w:numId': None,
                        #'w:ilvl': None,
                        'w:lang': None,
                        'w:keepNext': None,
                        'w:keepLines': None,
                        'w:pageBreakBefore': None,
                        'w:caps': None}

    styleElement = _getStyleElementByName(styleName, styleXml)

    if styleElement is None:
        return None

    basedOnStyleId = _getBasedOnStyleId(styleName, styleXml)

    # Recursion: if this style has a basedOn-style, method calls itself with the basedOnStyleName.
    inheritedDefinitions = None
    if basedOnStyleId is not None:
        inheritedDefinitions = _getCompleteStyleDefinitions(styleXml, _getStyleNameById(basedOnStyleId, styleXml), themeXml)

    if inheritedDefinitions is not None:
        styleDefinitions = inheritedDefinitions
    else:
        # Either the style has no basedOn-style, or the basedOn reference points to a style that
        # does not exist (the recursion returned None, which must not be used as the dict).
        # In both cases this style is the root of the chain: get the definitions in the
        # w:docDefaults-element first. The element is not required to be present.
        docDefaultsElements = styleXml.getElementsByTagName('w:docDefaults')
        if len(docDefaultsElements) != 0:
            styleDefinitions = _getStyleDefinitions(docDefaultsElements[0], styleDefinitions)

    styleDefinitions = _getStyleDefinitions(styleElement, styleDefinitions)

    if 'w:asciiTheme' in styleDefinitions:
        styleDefinitions = _getThemeFont(themeXml, styleDefinitions, styleDefinitions['w:asciiTheme'])

    return styleDefinitions
481
def _getElementValueWithinElement(elementTagName, element):
    '''Gets the text content of the first element with a certain tag in the given DOM tree.

    The value returned is the nodeValue of the first child node of the first matching element.

    @param elementTagName: The tag name of the wanted element.
    @param element: The element (or document) that is searched.

    @return: The text value of the element, or None if there is no such element, the element is empty,
    or the given element cannot be searched.
    '''
    try:
        return element.getElementsByTagName(elementTagName)[0].firstChild.nodeValue
    except (IndexError, AttributeError):
        # IndexError: no element with the given tag name.
        # AttributeError: the matching element has no children (firstChild is None),
        # or the given element is None / not searchable.
        return None
492
def _getElementWithinElement(element, elementTagName):
    '''Gets the first element with the given tag name inside an element.

    The search is done with getElementsByTagName on the given element and only the first hit is returned.

    @param element: The element whose children are searched.
    @param elementTagName: The tag name of the wanted element.

    @return: An element with the right tag name, or None if it wasn't found.
    '''
    candidates = element.getElementsByTagName(elementTagName)
    try:
        return candidates[0]
    except IndexError:
        return None
508
def _getElementsWithinElement(element, elementTagName):
    '''Gets the elements with the given tag name inside an element.

    The search is done with getElementsByTagName on the given element.
    An empty result is reported as None instead of an empty list.

    @param element: The element whose children are searched.
    @param elementTagName: Tag name of the wanted elements.

    @return: The list of elements with the right tag name, or None if none was found.
    '''
    found = element.getElementsByTagName(elementTagName)
    if len(found) > 0:
        return found
    return None
523 524 #def _getElementsWithinElement(elementTagName, xmlData): 525 # ''' Returns the elements of the given xml file if xmlData and the given elementTagName 526 # exist in the xmlData. 527 # 528 # xmlData - the given xml file''' 529 # elements = xmlData.getElementsByTagName(elementTagName) 530 # if len(elements) == 0: 531 # return None 532 # return elements 533 534
def _getTargetXmlFileByHeader(header, document):
    '''Gets the XML file that a header (or footer) reference points to.

    The r:id attribute of the reference is resolved with getRelsTargetByRId against
    word/_rels/document.xml.rels, and the target is then looked up in the document dict under 'word/'.

    @param header: The reference element that has the r:id attribute.
    @param document: The document as a dict keyed by the path of the file inside the docx.

    @return: The entry of the document dict for the target file.
    '''
    #TODO: try except
    relationshipId = header.getAttribute('r:id')
    relationships = document['word/_rels/document.xml.rels']
    targetFile = getRelsTargetByRId(relationshipId, relationships)
    # TODO: move this lookup into its own method.
    return document['word/' + targetFile]
def _checkFrontPageHeadersAndFooters(references, document):
    '''Goes through header or footer references and picks the first one that has content.

    A reference counts as having content when its target file contains at least one <w:t> tag.
    Even if there are references to headers or footers, they might be empty.

    @param references: Header or footer references, or None.
    @param document: The document as a dict keyed by the path of the file inside the docx.

    @return: The header or footer target XML file as a DOM tree, or None if no headers or footers with content were found.
    '''
    if references is None:
        return None

    for reference in references:
        targetFileXml = _getTargetXmlFileByHeader(reference, document)
        #TODO: do this more exactly, _getElementValueWithinElement?
        if _getElementsWithinElement(targetFileXml, 'w:t') is None:
            continue
        return targetFileXml

    return None
560 561
def _getPageFieldFormat(referenceXml):
    r'''Gets the numbering format switch of the PAGE field in a header or a footer.

    The field instruction is read from the first <w:instrText> element, for example
    ' PAGE \* roman \* MERGEFORMAT '. The format is the text between the first two '\*' separators.

    @param referenceXml: The header or footer XML file as a DOM tree, or None.

    @return: The format in lower case without surrounding whitespace, or None if there is no
    PAGE ... MERGEFORMAT instruction or the instruction has no format switch.
    '''
    if referenceXml is None:
        return None

    instruction = _getElementValueWithinElement('w:instrText', referenceXml)
    if instruction is None:
        return None

    # str.find() must not be used as a truth value: it returns -1 (true) for "not found"
    # and 0 (false) for "found at the very beginning".
    if 'PAGE' not in instruction or 'MERGEFORMAT' not in instruction:
        return None

    parts = instruction.split('\\*')
    if len(parts) > 2:
        return parts[1].lower().strip()
    return None


def _checkAutomaticPageNumbering(section, headerReference, footerReference, document, errorIds, numStartKey):
    r'''Checks if a section has an automatic page numbering and gets the numbering format.

    First goes through the section element and checks that the numbering starts at 1.
    Gets the section numbering of format definition.
    If it is defined, returns it.
    If a numbering format is not found in the section properties, it defaults to 'Standard'.
    If the numbering format is standard, checks the header and footer references for other numbering format definitions.
    The numbering format in the header or the footer reference is sometimes in <w:instrText> element
    inside the content of PAGE \* MERGEFORMAT.

    @param section: The section element to be searched for.
    @param headerReference: The current header of the section element as a DOM tree.
    @param footerReference: The current footer of the section element as a DOM tree.
    @param document: The document as a dict of DOM tree pairs. Not used by this function.
    @param errorIds: The dict for appending errors True/False. errorIds[numStartKey] is set to True when
    the numbering starts at 1 and to False otherwise; it is left untouched when the section has no w:pgNumType.
    @param numStartKey: The key for errorIds to append numbering start error.

    @return: The page numbering as a string format, or False if there was no page numbering
    or the numbering was both in header and footer.
    '''
    pgNumTypeElement = _getElementWithinElement(section, 'w:pgNumType')
    if pgNumTypeElement is None:
        return False

    startNum = pgNumTypeElement.getAttribute('w:start')
    errorIds[numStartKey] = (str(startNum) == '1')

    numFormat = _getPgNumFormat(pgNumTypeElement)

    #1. check if instrtext in referenceXml.
    #2. check if 'page' and 'mergeformat' texts are found -> page numbering field is found
    #3. check if there is a pagenumbering format definition in instrtext and return it
    #4. else return numFormat = "Standard"
    if numFormat == "Standard":
        headerFormat = _getPageFieldFormat(headerReference)
        # The footer format has to be read from the footer; reading the header here as well
        # made every header format look like "numbering in both header and footer".
        footerFormat = _getPageFieldFormat(footerReference)

        if headerFormat is not None and footerFormat is not None:
            #TODO: numbering in both header and footer.
            return False
        elif headerFormat is not None and headerFormat != "Standard":
            return headerFormat
        elif footerFormat is not None and footerFormat != "Standard":
            return footerFormat

    return numFormat
629
def _checkNameInHeaderOrFooter(reference, document):
    '''Looks for text inside a header or footer and sees if the last modifier's name is in there.

    Problem: sometimes we want to check that there is no name in the header or the footer.
    If a name is found but it's different from the last modifier's name, result is False,
    even though a name is in a header/footer.
    For now just tries to check that either the name of the last modifier or just some name was found.

    @param reference: The header or footer XML file as a DOM tree, or None.
    @param document: The document as a dict; docProps/core.xml is read from it.

    @return: True if a name is found in the text, False otherwise.
    '''
    if reference is None:
        return False

    pElements = _getElementsWithinElement(reference, 'w:p')
    if pElements is None:
        # No paragraphs at all, so there is no text where a name could be.
        return False

    # The last modifier does not change between paragraphs, so it is fetched only once.
    lastModifier = _getLastModifier(document["docProps/core.xml"])

    for pElement in pElements:
        textContent = getTextContent(pElement)

        #FIXME: if the name is different in the text and in the settings, this could give false negatives:
        #For example, if we don't want that toc section has a name in footer or header and the name is different in the text
        #and in the settings, this function returns False even though there is a name in the header or footer.
        # An empty name would be "found" in every text, so it is not searched for.
        # NOTE(review): _getLastModifier is defined elsewhere; presumably it can return an
        # empty value when the document has no last modifier - confirm.
        if lastModifier and textContent.find(lastModifier) != -1:
            return True

        #FIXME: fix the problem above, then see if this is necessary:
        #When the following if is done, the function can return True even though the name is different in the header or footer
        #and document settings.

        #check if the content is a digit -> page number
        #split at whitespace -> len > 1 -> probably a name!
        #check if the content is longer than 3 characters -> probably a name.
        if textContent.find('PAGE') == -1:
            strippedContent = textContent.strip()
            if strippedContent.isdigit():
                continue
            if len(strippedContent.split()) > 1:
                return True
            if len(strippedContent) > 3:
                return True

    return False
671 672
def _getPgNumFormat(sectionPgNumType):
    ''' Gets the number format of the given section page number type.

    The format is the value of the w:fmt attribute of the element.

    @param sectionPgNumType: The given page number type element of the section

    @return: The numbering format, defaults to 'Standard' if nothing else is defined.
    '''
    if sectionPgNumType and sectionPgNumType.hasAttribute('w:fmt'):
        return sectionPgNumType.getAttribute('w:fmt')
    return 'Standard'
686
def checkHeadersAndFooters(document):
    ''' Checks that the headers and footers of a document are made correctly.

    Assumes that the document has three sections:
        1. cover section
        2. table of contents section or toc section
        3. actual content section or text section

    A header or footer with content stays in effect for the following sections until
    a later section brings a new one with content.

    @see: checkSections method must pass in order to run this method

    @note:
    Places findings in the errorIds-dict as key-boolean pairs:

    'frontPage': was there headers or footers in the cover section.

    'tocPageNumbering': is there a page numbering in the toc section.

    'differentPageNumbering': is the page numbering different in the toc and text sections.

    'nameInToc': is the last modifiers name in toc section header or footer.

    'nameInText': is the last modifiers name in text section header or footer.

    'pageNumbering': is there a page numbering in the text section.

    'tocNumStart': does the toc section page numbering start at 1.

    'textNumStart': does the text section page numbering start at 1.

    'titlePg': is the Microsoft Office setting "Different first page" on. (Not set by this method; stays None.)

    @note: XML example:

    <w:pgNumType w:fmt="lowerRoman" w:start="1"/>)

    <w:pgNumType w:start="1"/>

    @param document: The document as a dict of DOM trees keyed by the path of the file inside the docx.

    @return: Findings in the errorIds-dict as key-boolean pairs as described above.
    '''

    def latestReferences(sectPr, hdrRef, ftrRef):
        # Replaces the header/footer in effect with the ones of this section, if they have content.
        headerReferences = _getElementsWithinElement(sectPr, 'w:headerReference')
        footerReferences = _getElementsWithinElement(sectPr, 'w:footerReference')

        newHdrRef = _checkFrontPageHeadersAndFooters(headerReferences, document)
        if newHdrRef is not None:
            hdrRef = newHdrRef

        newFtrRef = _checkFrontPageHeadersAndFooters(footerReferences, document)
        if newFtrRef is not None:
            ftrRef = newFtrRef

        return hdrRef, ftrRef

    def numberingOf(sectPrs, hdrRef, ftrRef, numStartKey, numberingKey):
        # Goes through the w:sectPr elements of one page section until a page numbering is found.
        numberingFormat = None
        for sectPr in sectPrs:
            hdrRef, ftrRef = latestReferences(sectPr, hdrRef, ftrRef)
            numberingFormat = _checkAutomaticPageNumbering(sectPr, hdrRef, ftrRef, document, errorIds, numStartKey)

            #PAGE NUMBERING IN HEADER AND FOOTER:
            errorIds[numberingKey] = numberingFormat is not False
            if numberingFormat is not False:
                break
        return numberingFormat, hdrRef, ftrRef

    docXml = document['word/document.xml']

    allSectionProperties = getSectionElementsBySections(docXml)

    errorIds = {'frontPage': None, 'tocPageNumbering': None, 'differentPageNumbering': None,
                'nameInToc': None, 'nameInText': None, 'pageNumbering': None, 'tocNumStart': None,
                'textNumStart': None, 'titlePg': None} # @see XML requirement file

    currentHdrRef = None
    currentFtrRef = None

    # allSectionProperties[0] = cover page
    # allSectionProperties[1] = table of contents page
    # allSectionProperties[2] = actual document section
    coverSection = allSectionProperties[0]
    tocSection = allSectionProperties[1]
    commonSection = allSectionProperties[2]

    for coverSectPr in coverSection:
        currentHdrRef, currentFtrRef = latestReferences(coverSectPr, currentHdrRef, currentFtrRef)

    errorIds["frontPage"] = currentHdrRef is not None or currentFtrRef is not None

    tocSectionNumberingFormat, currentHdrRef, currentFtrRef = numberingOf(
        tocSection, currentHdrRef, currentFtrRef, "tocNumStart", 'tocPageNumbering')

    # is document's writer's name in tocSection header or footer?
    errorIds['nameInToc'] = not (_checkNameInHeaderOrFooter(currentHdrRef, document) is False and
                                 _checkNameInHeaderOrFooter(currentFtrRef, document) is False)

    commonSectionNumberingFormat, currentHdrRef, currentFtrRef = numberingOf(
        commonSection, currentHdrRef, currentFtrRef, "textNumStart", 'pageNumbering')

    # is the document's maker's name in the body part of the document
    errorIds['nameInText'] = (_checkNameInHeaderOrFooter(currentHdrRef, document) is True or
                              _checkNameInHeaderOrFooter(currentFtrRef, document) is True)

    errorIds['differentPageNumbering'] = (tocSectionNumberingFormat is not False and
                                          commonSectionNumberingFormat is not False and
                                          tocSectionNumberingFormat != commonSectionNumberingFormat)

    return errorIds
832
def getParagraphElementsBySections(docXml, sectionName):
    '''Gets the body-level elements of the wanted section.

    Walks the direct children of w:body and groups them into sections.
    A child that contains a w:sectPr whose w:type value is anything other than
    'continuous' closes the current section. Continuous section breaks, and
    w:sectPr elements without a w:type child, do not change the section.

    The first group is the cover section, the second the table of contents
    section and the third the text section. The document has to have at least
    3 sections, otherwise an IndexError is raised.

    @param docXml: The document.xml file as a DOM tree.
    @param sectionName: The wanted section can be 'cover', 'toc' or 'text'.

    @return: The list of the section elements, or None if sectionName is unknown.
    '''
    bodyElement = docXml.getElementsByTagName('w:body')[0]

    # sectionList = [[w:p], [w:p, w:p], [w:p]]
    sectionList = [[]]

    for child in bodyElement.childNodes:
        sectionList[-1].append(child)

        # Text and comment nodes have no getElementsByTagName and cannot
        # carry a section break, so there is nothing more to inspect.
        if child.nodeType != child.ELEMENT_NODE:
            continue

        sectPrs = child.getElementsByTagName('w:sectPr')
        if len(sectPrs) == 0:
            continue

        typeElement = _getElementWithinElement(sectPrs[0], 'w:type')
        if typeElement is None:
            continue

        # Only a page breaking section break starts a new section.
        if typeElement.getAttribute('w:val') != 'continuous':
            sectionList.append([])

    sectionElements = {'cover': sectionList[0], 'toc': sectionList[1], 'text': sectionList[2]}
    # dict.get replaces the Python 2 only has_key() and returns None for unknown names.
    return sectionElements.get(sectionName)
869
def getSectionElementsBySections(docXml, index=None):
    '''Gets the w:sectPr elements of a document grouped by page sections.

    The w:sectPr elements are stored in a two dimensional list.
    A continuous section break stays in the current inner list, any other
    section break is stored in the current inner list and then starts a new one.
    w:sectPr elements that have no w:type child are not stored at all.

    @param docXml: The document.xml file as a DOM tree.
    @param index: The index of the outer list to return. None by default.

    @return: The two dimensional list of w:sectPr elements if index is None.
    Otherwise the inner list at the given index.
    '''
    pageSections = [[]]

    for sectPr in _getElementsWithinElement(docXml, "w:sectPr"):
        typeElement = _getElementWithinElement(sectPr, 'w:type')
        if typeElement is None:
            continue

        pageSections[-1].append(sectPr)
        if typeElement.getAttribute('w:val') != 'continuous':
            # A page breaking section ends the current group.
            pageSections.append([])

    # Drop the trailing group if nothing was stored in it.
    if len(pageSections[-1]) == 0:
        pageSections.pop()

    if index is not None:
        return pageSections[index]
    return pageSections
904
def _areSectionsOverlapping(outerParagraphElements, innerParagraphElements, errorList, errorMsg, expectedResult):
    '''Checks whether two lists of paragraph elements share at least one node.

    When a shared node is found the error message is appended to the error list
    once and the expected result is inverted.

    @param outerParagraphElements: The first list of paragraph elements.
    @param innerParagraphElements: The second list of paragraph elements.
    @param errorList: The list for appending error messages.
    @param errorMsg: The error message to be appended.
    @param expectedResult: Boolean of the expected result.

    @return: expectedResult inverted if the lists overlap, unchanged otherwise.
    '''
    # any() stops at the first shared node, just like breaking out of nested loops.
    overlapping = any(outerElement.isSameNode(innerElement)
                      for outerElement in outerParagraphElements
                      for innerElement in innerParagraphElements)

    if not overlapping:
        return expectedResult

    errorList.append(errorMsg)
    return not expectedResult
928
def checkSections(document, errorList):
    '''Checks that the sections of the document are made properly.

    There must be at least three sections in the document.
    No section may contain a w:titlePg element (the Microsoft Office Word
    setting "Different first page").
    The table of contents paragraphs must not be in the cover section and
    must be in the table of contents section.

    @param document: The document parts as DOM trees, keyed by their path.
    @param errorList: The list for appending error ids ('titlePg', 'cover', 'toc').

    @return: True if everything went well, False if there are fewer than three
    sections or a w:titlePg was found, otherwise the error list.
    '''
    docXml = document['word/document.xml']
    styleXml = document['word/styles.xml']

    allSectionProperties = getSectionElementsBySections(docXml)
    if len(allSectionProperties) < 3:
        return False

    for sectPrGroup in allSectionProperties:
        for sectPr in sectPrGroup:
            if sectPr.getElementsByTagName('w:titlePg'):
                errorList.append('titlePg')
                return False

    tocParagraphs = _getParagraphElementsBySequentialStyleName('toc', styleXml, docXml)
    coverSectionParagraphs = getParagraphElementsBySections(docXml, 'cover')
    tocSectionParagraphs = getParagraphElementsBySections(docXml, 'toc')

    # The cover check starts as True and is inverted if toc paragraphs are found on the cover.
    coverIsFine = _areSectionsOverlapping(coverSectionParagraphs, tocParagraphs, errorList, "cover", True)
    # The toc check starts as False and is inverted if toc paragraphs are found in the toc section.
    tocIsFine = _areSectionsOverlapping(tocSectionParagraphs, tocParagraphs, errorList, "toc", False)

    # TODO: error handling, errorIdsAndPositions
    if tocIsFine is True and coverIsFine is True:
        return True
    return errorList
968 969 #def getSectionProperties(document): 970 # ''' Checks all the sectPr-elements. There must be at least three section breaks (first page, toc-page and content) in the whole document. 971 # The margins should be the same throughout the whole document? 972 # 973 # @return: section properties in a dict''' 974 # 975 # docXml = document['word/document.xml'] 976 # 977 # finalSectionProperties = dict([['w:w', None], ['w:h', None], ['w:top', None], ['w:right', None], ['w:bottom', None], \ 978 # ['w:left', None], ['w:right', None], ['w:header', None], ['w:footer', None], \ 979 # ['w:gutter', None], ['w:start', None], ['w:space', None], ['w:linePitch', None], ['w:titlePg', None]]) 980 # 981 # allSectionProperties = _getElements('w:sectPr', docXml) # returns a nodeList 982 # 983 # for element in allSectionProperties: 984 # finalSectionProperties = _getStyleDefinitions(element, finalSectionProperties) 985 # #TODO: conversions from twips to cm 986 # 987 # 988 # return finalSectionProperties 989 990
def _checkPageProperties(allSectionProperties, pageProperties, tagName):
    '''Collects page property values that are coherent over all sections.

    The first value seen for a property is stored in pageProperties.
    If a later section has a different value for the same property,
    the property is set to False and stays False.

    @param allSectionProperties: All w:sectPr elements of the document.
    @param pageProperties: A dict whose keys are attribute names without the 'w:' prefix,
    for example {'top': None, 'right': None, 'bottom': None, 'left': None} or {'w': None, 'h': None}.
    @param tagName: Tag name of the element whose attributes are checked.

    @return: The pageProperties dict with coherent values stored and incoherent values set as False.
    '''
    for sectPr in allSectionProperties:
        propertyElement = _getElementWithinElement(sectPr, tagName)
        for key in list(pageProperties):
            size = propertyElement.getAttribute('w:' + key)
            storedSize = pageProperties[key]
            if storedSize is None:
                pageProperties[key] = size
            elif storedSize != size:
                # False means that the value differs between sections.
                pageProperties[key] = False

    return pageProperties
1014
def _convertSizes(sizesDict):
    '''Converts the twip values of a dict to centimeters in place.

    @param sizesDict: A dict whose values can be converted to int (sizes in twips).

    @return: The same dict with every value converted by convertTwipToCm and rounded to one decimal.
    '''
    for key in list(sizesDict):
        sizeInCm = convertTwipToCm(int(sizesDict[key]))
        # rounded to one decimal place
        sizesDict[key] = round(sizeInCm, 1)

    return sizesDict
1023
def getPageMarginals(document):
    '''Gets the page marginal sizes of the document.

    Reads the top, right, bottom and left attributes of the w:pgMar element
    of every w:sectPr element.

    @note: Header and footer sizes are not checked, because they can be different
    in different sections of the document.

    @param document: The document parts as DOM trees, keyed by their path.

    @return: False if the marginals are not coherent, otherwise a dictionary containing the marginal sizes.
    '''
    sectPrs = _getElementsWithinElement(document['word/document.xml'], 'w:sectPr')

    pageMarginals = dict.fromkeys(('top', 'right', 'bottom', 'left'))
    _checkPageProperties(sectPrs, pageMarginals, 'w:pgMar')

    # A False value marks a marginal that differs between sections.
    if False in pageMarginals.values():
        return False

    return _convertSizes(pageMarginals)
1043
def getPageSize(document):
    '''Gets the page size of the document.

    Reads the w and h attributes of the w:pgSz element of every w:sectPr element.

    @param document: The document parts as DOM trees, keyed by their path.

    @return: False if the page sizes are not coherent, otherwise a dictionary
    containing the page 'width' and 'height'.
    '''
    sectPrs = _getElementsWithinElement(document['word/document.xml'], 'w:sectPr')

    pageSize = dict.fromkeys(('w', 'h'))  # width and height
    _checkPageProperties(sectPrs, pageSize, 'w:pgSz')

    finalPageSize = {'width': pageSize['w'], 'height': pageSize['h']}

    # A False value marks a size that differs between sections.
    if False in finalPageSize.values():
        return False

    return _convertSizes(finalPageSize)
1062 1063
def _getTitle(coreXml):
    '''Gets the value of the dc:title element of the given core properties tree.

    @param coreXml: docProps/core.xml as a DOM tree.

    @return: Whatever _getElementValueWithinElement gives for 'dc:title'.
    '''
    title = _getElementValueWithinElement('dc:title', coreXml)
    return title
1067
def _getCreator(coreXml):
    '''Gets the value of the dc:creator element of the given core properties tree.

    @param coreXml: docProps/core.xml as a DOM tree.

    @return: Whatever _getElementValueWithinElement gives for 'dc:creator'.
    '''
    creator = _getElementValueWithinElement('dc:creator', coreXml)
    return creator
1072
def _getLastModifier(coreXml):
    '''Gets the value of the cp:lastModifiedBy element of the given core properties tree.

    @param coreXml: docProps/core.xml as a DOM tree.

    @return: Whatever _getElementValueWithinElement gives for 'cp:lastModifiedBy'.
    '''
    lastModifier = _getElementValueWithinElement('cp:lastModifiedBy', coreXml)
    return lastModifier
1076
def _getCreateDate(coreXml):
    '''Gets the value of the dcterms:created element (the creation date) of the given core properties tree.

    @param coreXml: docProps/core.xml as a DOM tree.

    @return: Whatever _getElementValueWithinElement gives for 'dcterms:created'.
    '''
    createDate = _getElementValueWithinElement('dcterms:created', coreXml)
    return createDate
1080
def _getLastModifiedDate(coreXml):
    '''Gets the value of the dcterms:modified element (the last modified date) of the given core properties tree.

    @param coreXml: docProps/core.xml as a DOM tree.

    @return: Whatever _getElementValueWithinElement gives for 'dcterms:modified'.
    '''
    lastModifiedDate = _getElementValueWithinElement('dcterms:modified', coreXml)
    return lastModifiedDate
1084
def _getRevision(coreXml):
    '''Gets the value of the cp:revision element of the given core properties tree.

    @param coreXml: docProps/core.xml as a DOM tree.

    @return: Whatever _getElementValueWithinElement gives for 'cp:revision'.
    '''
    revision = _getElementValueWithinElement('cp:revision', coreXml)
    return revision
1088 1089
def _getTextFromParagraph(paragraph):
    '''Gets the text content of <w:t>-elements from the given (paragraph) element.

    Only the first child node of each <w:t> element is read.
    Empty <w:t> elements are skipped instead of raising an AttributeError.

    @param paragraph: The element whose <w:t> elements are read.

    @return: The text content as a string, an empty string if there is no text.
    '''
    textElements = _getElementsWithinElement(paragraph, 'w:t')
    if textElements is None:
        return ''

    textParts = []
    for textElement in textElements:
        textNode = textElement.firstChild
        # <w:t/> has no child node at all, so there is no text to collect.
        if textNode is None or textNode.nodeValue is None:
            continue
        textParts.append(textNode.nodeValue)

    return ''.join(textParts)
1103
def checkTocContent(document):
    '''Checks if all of the headings created in the document are listed in the table of contents.

    The non-empty heading texts are compared in order to the texts of the
    table of contents paragraphs: there has to be the same number of both and
    every table of contents entry has to contain the text of its heading.
    Finally the PAGEREF field codes of the table of contents are compared to
    the bookmark names of the document.

    @param document: The document parts as DOM trees, keyed by their path.

    @return: True if toc matches the headings content, False otherwise.
    '''
    docXml = document['word/document.xml']
    styleXml = document['word/styles.xml']

    headingParagraphs = _getParagraphElementsBySequentialStyleName("heading", styleXml, docXml)
    docTocStyles = _getParagraphElementsBySequentialStyleName('toc', styleXml, docXml)

    # Headings without any text are not expected to be in the table of contents.
    docHeadings = []
    for heading in headingParagraphs:
        headingText = _getTextFromParagraph(heading).strip()
        if headingText != "":
            docHeadings.append(headingText)

    tocHeadings = [_getTextFromParagraph(tocParagraph).strip() for tocParagraph in docTocStyles]

    if len(docHeadings) != len(tocHeadings):
        return False

    # A toc entry also holds the page number, so the heading only has to be a part of it.
    for tocHeading, docHeading in zip(tocHeadings, docHeadings):
        if docHeading not in tocHeading:
            return False

    bookmarkStarts = _getElementsWithinElement(docXml, 'w:bookmarkStart')
    for tocParagraph in docTocStyles:
        for instrElement in _getElementsWithinElement(tocParagraph, 'w:instrText'):
            # An empty <w:instrText/> has no text node to read.
            if instrElement.firstChild is None:
                continue
            instrElementValue = instrElement.firstChild.nodeValue
            # We only want to handle the tags with value including PAGEREF.
            if not instrElementValue or 'PAGEREF' not in instrElementValue:
                continue
            for bookmark in bookmarkStarts:
                # NOTE(review): the whole field code is searched for inside the bookmark name
                # and a match fails the check; this looks inverted -- confirm the intent.
                if instrElementValue in bookmark.getAttribute('w:name'):
                    return False

    return True
1161
def checkTOC(document):
    '''Checks if the document has a table of contents.

    Looks up the style id of the style named "toc 1" and searches the document
    for a <w:pStyle> element that refers to it.

    @note: XML example:

    <w:p w:rsidR="002274FC" w:rsidRDefault="00FA6E61">

    <w:pPr>

    <w:pStyle w:val="Sisluet1"/>

    @param document: The document parts as DOM trees, keyed by their path.

    @return: True if a paragraph using the "toc 1" style is found, False otherwise.
    '''
    tocStyleId = _getStyleIdByName("toc 1", document['word/styles.xml'])  # for example Sisluet1
    pStyles = _getElementsWithinElement(document['word/document.xml'], 'w:pStyle')

    if pStyles is None:
        return False

    # TODO: the page/section break before and after the table of contents is not checked.
    return any(pStyle.getAttribute('w:val') == tocStyleId for pStyle in pStyles)
1221 #if not tocExists: errors.append("There is no table of contents at all.") 1222 1223 1224 #return styleId 1225
def checkCoverPage(document):
    '''Checks if the front page is done correctly.

    Collects the text of the paragraphs up to the paragraph holding the last
    section break of the cover section and looks for an '@' character, the
    last modifier of the document and the title of the document in it.

    @param document: The document parts as DOM trees, keyed by their path.

    @return: coverPageText dictionary with keys 'email', 'name' and 'title' containing True or False values.
    '''
    # TODO: the structure of the document should be checked before this.
    coverPageText = {'email': False,
                     'name': False,
                     'title': False}

    docXml = document['word/document.xml']
    coreXml = document['docProps/core.xml']

    paragraphs = _getElementsWithinElement(docXml, 'w:p')
    coverSectionProperties = getSectionElementsBySections(docXml, 0)

    # w:sectPr -> w:pPr -> w:p
    lastParagraphOfFirstPage = coverSectionProperties[-1].parentNode.parentNode

    # Saves the content of the first page, getting text from the beginning and
    # breaks the loop when the paragraph of the sectPr-node appears.
    # NOTE: if the section breaks are done wrong, the table of contents etc.
    # can leak into the "first page" text.
    textParts = []
    for paragraph in paragraphs:
        textParts.append(getTextContent(paragraph))
        if paragraph.isSameNode(lastParagraphOfFirstPage):
            break
    firstPageText = ''.join(textParts)

    # Any '@' counts as an e-mail address here.
    if '@' in firstPageText:
        coverPageText['email'] = True

    # The last modifier can be missing from core.xml; searching for None would raise a TypeError.
    lastModifier = _getLastModifier(coreXml)
    if lastModifier is not None and lastModifier in firstPageText:
        coverPageText['name'] = True

    title = _getTitle(coreXml)
    if title is not None and title in firstPageText:
        coverPageText['title'] = True

    return coverPageText
1267
def getRelsTargetByRId(rId, rels):
    '''Gets the Target attribute of the first Relationship element that has the given id.

    The value of Target attribute can be for example a relative path to local XML files or images.
    It can also be a hyperlink.

    @param rId: Id attribute value of a Relationship element.
    @param rels: rels file as a DOM tree.

    @return: The value of Target attribute if found, None otherwise.
    '''
    matchingTargets = (relationship.getAttribute('Target')
                       for relationship in rels.getElementsByTagName('Relationship')
                       if relationship.getAttribute('Id') == rId)
    return next(matchingTargets, None)
1280
def getParentParagraph(element, tag='w:p'):
    '''Gets the closest ancestor of an element that has the given tag name.

    @param element: The element whose parent <w:p> element is searched for.
    @param tag: The parent tagname, defaults to 'w:p'.

    @return: The parent element, or None if no parent is found.
    '''
    ancestor = element.parentNode
    try:
        while ancestor is not None and ancestor.tagName != tag:
            ancestor = ancestor.parentNode
    except AttributeError:
        # A node without a tag name (like the document node) ends the search.
        return None
    return ancestor
1301
def checkImages(document):
    '''Checks if there is an image in the document.

    Both pic:pic and w:pict elements are counted as images.

    @param document: The document parts as DOM trees, keyed by their path.

    @return: True if even one image is found, False otherwise.
    '''
    #TODO: what is the difference between pic:pic and w:pict?
    #w:pict is used when pasting a chart from powerpoint or excel?
    docXml = document['word/document.xml']
    return any(len(docXml.getElementsByTagName(tagName)) > 0
               for tagName in ('pic:pic', 'w:pict'))
1318
def getImagePaths(document):
    '''Gets the image paths or the file names of the images used in the document.

    The relationship id of a pic:pic picture is read from the r:embed attribute
    of its a:blip element. A w:pict picture has no a:blip element, so its
    relationship id is read from the r:id attribute of its v:imagedata element.
    Pictures that have neither of the elements are skipped.

    @param document: The document parts as DOM trees, keyed by their path.

    @return: The image targets as strings in a list.
    '''
    docXml = document['word/document.xml']

    picElements = list(docXml.getElementsByTagName('pic:pic'))
    picElements += docXml.getElementsByTagName('w:pict')

    targets = []
    for picElement in picElements:
        blipElements = picElement.getElementsByTagName('a:blip')
        if len(blipElements) > 0:
            picRId = blipElements[0].getAttribute('r:embed')
        else:
            imageDataElements = picElement.getElementsByTagName('v:imagedata')
            if len(imageDataElements) == 0:
                # For example a drawn shape: there is no image file to point to.
                continue
            picRId = imageDataElements[0].getAttribute('r:id')

        targets.append(getRelsTargetByRId(picRId, document['word/_rels/document.xml.rels']))

    return targets
1333
def checkImageCaptions(document):
    '''Checks if the next paragraph after a picture paragraph uses the caption style.

    Also checks that the caption contains an automatic field: the text 'SEQ'
    has to be found either in the attribute values or in the text of the caption paragraph.
    Goes through all picture paragraphs.

    @note: XML example:

    <w:p>
        <w:pPr>
            <w:pStyle w:val="Kuvanotsikko"/>
        </w:pPr>
        <w:r>
            <w:t xml:space="preserve">Kuva </w:t>
        </w:r>
        <w:fldSimple w:instr=" SEQ Kuva \\* ARABIC ">
            <w:r>
                <w:t>1</w:t>
            </w:r>
        </w:fldSimple>
    </w:p>

    @param document: The document parts as DOM trees, keyed by their path.

    @return: True if all images have captions, False otherwise.
    '''
    styleXml = document['word/styles.xml']
    docXml = document['word/document.xml']

    picElements = list(docXml.getElementsByTagName('pic:pic'))
    picElements += docXml.getElementsByTagName('w:pict')

    picParagraphs = [getParentParagraph(pic) for pic in picElements]

    for p in picParagraphs:
        try:
            captionParagraph = p.nextSibling
            captionParagraphPpr = captionParagraph.getElementsByTagName('w:pPr')[0]
            captionParagraphStyleID = captionParagraphPpr.getElementsByTagName('w:pStyle')[0].getAttribute('w:val')
            captionParagraphStyleName = _getStyleNameById(captionParagraphStyleID, styleXml)
        except Exception:
            # No following paragraph, no paragraph properties or no style: there is no caption.
            return False

        if captionParagraphStyleName != "caption":
            return False

        # Find 'SEQ' in either one of the attribute values in the paragraph or as a text node.
        # It indicates an automatic field.
        if getAttributeContent(captionParagraph).find('SEQ') == -1 and \
           getTextContent(captionParagraph).find('SEQ') == -1:
            return False

    return True
1391
def checkStyleUsage(document, errorIdsAndPositions):
    '''Checks that text paragraphs are using styles and that no manual style definitions are made.

    Goes through all paragraph-elements in a document looking for <w:pStyle>-elements.
    Gets the style definitions to see if there are manual changes.
    Paragraphs without any text never produce an error.

    @note: Exception:

    Automatically generated table of contents can contain "manual" style definitions.
    The <w:sectPr> elements within paragraph elements are skipped also.

    @param document: The document parts as DOM trees, keyed by their path.
    @param errorIdsAndPositions: A dict for error strings. Should contain keys 'manualChanges' and 'styleNotUsed'.

    @return: True if nothing was found, False if even one error was found.
    '''
    paragraphs = document['word/document.xml'].getElementsByTagName('w:p')
    styleXml = document['word/styles.xml']

    for p in paragraphs:
        # The definitions that are counted as manual changes.
        # w:outline, w:numId, w:ilvl and w:lang are left out on purpose.
        styleDefinitions = {'w:autoRedefine': None,
                            'w:left': None,
                            'w:right': None,
                            'w:firstLine': None,
                            'w:line': None,
                            'w:before': None,
                            'w:after': None,
                            'w:widowControl': None,
                            'w:jc': None,
                            'w:sz': None,
                            'w:ascii': None,
                            'w:asciiTheme': None,
                            'w:b': None,
                            'w:i': None,
                            'w:u': None,
                            'w:keepNext': None,
                            'w:keepLines': None,
                            'w:pageBreakBefore': None}

        try:
            style = p.getElementsByTagName('w:pStyle')[0].getAttribute('w:val')
            style = _getStyleNameById(style, styleXml)

            # It seems that Word (2010) makes style definitions in document.xml
            # when generating an automatic table of contents:
            if style.startswith('toc'):
                continue
        except Exception:
            # No usable style was found for the paragraph.
            # TODO: check the empty paragraphs content to prevent false positives,
            # are they empty text paragraphs or maybe a picture paragraph etc?
            pContent = _getTextFromParagraph(p)
            if pContent.strip() != "":
                errorIdsAndPositions['styleNotUsed'].append(pContent[:30])

        try:
            for paragraphProperties in p.getElementsByTagName('w:pPr'):
                for propertyElement in paragraphProperties.childNodes:
                    # We don't want section properties to be mixed up as manually made style definitions.
                    if propertyElement.tagName != "w:sectPr":
                        styleDefinitions = _getStyleDefinitions(propertyElement, styleDefinitions)
            for runProperties in p.getElementsByTagName('w:rPr'):
                styleDefinitions = _getStyleDefinitions(runProperties, styleDefinitions)

            # One found definition is enough, the paragraph is reported only once.
            if any(value is not None for value in styleDefinitions.values()):
                pContent = _getTextFromParagraph(p)
                if pContent.strip() != "":
                    errorIdsAndPositions['manualChanges'].append(pContent[:50])
        except Exception:
            # Best effort: a paragraph that cannot be inspected is skipped.
            continue

    for key in errorIdsAndPositions.keys():
        if len(errorIdsAndPositions[key]) > 0:
            return False
    return True
1477
def checkEndnotesAndFootnotes(document):
    '''Checks if there is an endnote or a footnote in the document.

    Looks for w:endnoteReference and w:footnoteReference elements in document.xml.

    @param document: The document parts as DOM trees, keyed by their path.

    @return: True if an endnote or a footnote is found, False otherwise.
    '''
    docXml = document["word/document.xml"]

    # TODO: doublecheck: find the endnote with the id in endnotes.xml. That's where the endnote text is located.
    # TODO: doublecheck: find the footnote in footnotes.xml
    for referenceTag in ('w:endnoteReference', 'w:footnoteReference'):
        if docXml.getElementsByTagName(referenceTag):
            return True

    return False
1497
def checkCrossRefrenceToImageCaption(document):
    '''Goes through images' captions looking for a reference. Then checks if the caption is referenced somewhere.

    The paragraph after a picture paragraph is expected to be the caption.
    The name of the first w:bookmarkStart of the caption is searched for in
    the w:instrText elements of the document.

    @param document: The document parts as DOM trees, keyed by their path.

    @return: True if a cross reference is found, False if a caption has no bookmark
    or no caption is referenced.
    '''
    #TODO: not implemented in word_processing 13.5.2011
    docXml = document['word/document.xml']

    picElements = list(docXml.getElementsByTagName('pic:pic'))
    picElements += docXml.getElementsByTagName('w:pict')

    picParagraphs = [getParentParagraph(pic) for pic in picElements]

    # The field codes do not change between the pictures, so they are fetched only once.
    instrTextElements = docXml.getElementsByTagName('w:instrText')

    for p in picParagraphs:
        try:
            captionParagraph = p.nextSibling
            bookmarkStartElement = captionParagraph.getElementsByTagName('w:bookmarkStart')[0]
        except (AttributeError, IndexError):
            # The picture is not in a paragraph, nothing follows it
            # or the caption has no bookmark: the reference cannot exist.
            return False
        reference = bookmarkStartElement.getAttribute('w:name')

        for element in instrTextElements:
            if getTextContent(element).find(reference) != -1:
                return True

    return False
1529
def _getElementByAttributeValue(nodeList, attributeName, attributeValue):
    '''Gets the first element of a list that has the wanted attribute value.

    @param nodeList: A list of elements to be searched for.
    @param attributeName: The name of the wanted attribute.
    @param attributeValue: The wanted value of the attribute.

    @return: The element, if it has an attribute with the wanted value, None otherwise.
    '''
    matchingElements = (node for node in nodeList
                        if node.getAttribute(attributeName) == attributeValue)
    return next(matchingElements, None)
1543
def _isStyleUsed(document, styleName):
    '''Checks that a style is used in the document.

    Gets the id of the style from styles.xml and looks for paragraphs
    using that style id in document.xml.

    @param document: The document parts as DOM trees, keyed by their path.
    @param styleName: The name of the style looked for.

    @return: True if the style is used, False otherwise.
    '''
    styleId = _getStyleIdByName(styleName, document['word/styles.xml'])
    paragraphsUsingStyle = _getParagraphElementsByStyleId(document['word/document.xml'], styleId)

    return len(paragraphsUsingStyle) > 0
1560 1561 #def checkHeadingUsage(document): 1562 # '''Check if heading styles are used in the document. 1563 # 1564 # @return: True if heading styles are used, False otherwise. 1565 # ''' 1566 # docXml = document['word/document.xml'] 1567 # styleXml = document['word/styles.xml'] 1568 # 1569 # headingParagraphs = _getParagraphElementsBySequentialStyleName("heading", styleXml, docXml) 1570 # if len(headingParagraphs) == 0: 1571 # #errors.append("No heading styles used in this document!") 1572 # return False 1573 # 1574 # usedHeadingStyles = [] 1575 # for heading in headingParagraphs: 1576 # styleId = heading.getElementsByTagName('w:pStyle')[0].getAttribute('w:val') 1577 # if usedHeadingStyles.count(styleId) == 0: 1578 # usedHeadingStyles.append(str(styleId)) 1579 # 1580 # #FIXME: not the most dynamic way: 1581 # if len(usedHeadingStyles) < 2: 1582 # return False 1583 # 1584 # return True 1585
1586 -def checkHeadingNumbering(document, errorIdsAndPositions):
1587 '''Checks the headings in the document. 1588 1589 Goes through the heading styles used in the document checking that they use a multilevel numbering, 1590 the numbering is done correctly using styles and that the numbering is connected to other heading styles. 1591 1592 Gets all the heading styles used in the document. 1593 Searches for the heading's numbering definition reference in styles.xml. 1594 Next searches the associated numbering definition in numbering.xml. 1595 Next searches the correct numbering level definition associated to the heading. 1596 Checks that the numbering is multilevel and done correctly using the heading styles. 1597 1598 @note: XML example: 1599 1600 styles.xml: 1601 1602 <w:style w:type="paragraph" w:styleId="Heading2"> - Heading 2 style definition 1603 1604 <w:name w:val="heading 2"/> 1605 1606 <w:pPr> 1607 1608 <w:numPr> 1609 1610 <w:ilvl w:val="1"/> - Numbering Level Reference 1611 1612 <w:numId w:val="1"/> - Numbering Definition Instance Reference 1613 1614 </w:numPr> 1615 1616 <w:outlineLvl w:val="1"/> 1617 1618 </w:pPr> 1619 1620 </w:style> 1621 1622 numbering.xml: 1623 1624 <w:abstractNum w:abstractNumId="0"> - Abstract Numbering Definition 1625 1626 <w:multiLevelType w:val="multilevel"/> - Abstract Numbering Definition Type 1627 1628 <w:lvl w:ilvl="0"> - </w:lvl> - Numbering Level Definition 1629 1630 <w:lvl w:ilvl="1"> - Numbering Level Definition 1631 1632 <w:start w:val="1"/> - Starting Value 1633 1634 <w:numFmt w:val="decimal"/> - Numbering Format 1635 1636 <w:pStyle w:val="Heading2"/> - Paragraph Style's Associated Numbering Level 1637 1638 <w:lvlText w:val="%1.%2"/> - Numbering Level Text 1639 1640 <w:lvlJc w:val="left"/> - Justification 1641 1642 <w:pPr> - Numbering Level Associated Paragraph Properties 1643 1644 <w:ind w:left="576" w:hanging="576"/> 1645 1646 </w:pPr> 1647 1648 </w:lvl> 1649 1650 </w:abstractNum> 1651 1652 <w:num w:numId="1"> - Numbering Definition Instance 1653 1654 <w:abstractNumId w:val="0"/> - 
Abstract Numbering Definition Reference 1655 1656 </w:num> 1657 1658 @param errorIdsAndPositions: A dict for appending errors in key - stringlist pairs. 1659 Should contain the following keys: 1660 - 'manualNumbering' -- numbering is done manually somehow. 1661 - 'styleNotUsed' -- an expected heading style is not used. 1662 - 'differentNumbering' -- some heading style is using different numbering than some other heading styles. 1663 - 'notMultilevel' -- the numbering is not multilevel. 1664 - 'outlineLvl' -- the outline of a heading style is not correct. 1665 - 'numStart' -- the numbering doesn't start at 1. 1666 - 'numWrong' -- the numbering is somehow not done with styles. 1667 - 'numFormat' -- the numbering format is not correct. 1668 - 'notSequential' -- heading styles are not used correctly in a row for example heading 3 is used after heading 1. 1669 ''' 1670 1671 #errorIdsAndPositions = {'manualNumbering': None} 1672 docXml = document["word/document.xml"] 1673 styleXml = document["word/styles.xml"] 1674 1675 try: 1676 numXml = document['word/numbering.xml'] 1677 #numFile = zip.read('word/numbering.xml') 1678 #numXml = xml.dom.minidom.parseString(numFile) 1679 except: 1680 #errors.append("No heading numbering used at all.") 1681 return False 1682 1683 1684 headingParagraphs = _getParagraphElementsBySequentialStyleName("heading", styleXml, docXml) 1685 #@see: checkHeadingUsage 1686 #if len(headingParagraphs) == 0: 1687 # errors.append("No heading styles used in this document!") 1688 # return 1689 1690 usedHeadingsStyleIds = [] 1691 previousHeadingLevel = 0 1692 for heading in headingParagraphs: 1693 styleId = heading.getElementsByTagName('w:pStyle')[0].getAttribute('w:val') 1694 if len(heading.getElementsByTagName('w:ilvl')) > 0 or \ 1695 len(heading.getElementsByTagName('w:numId')) > 0: 1696 errorIdsAndPositions['manualNumbering'] = getTextContent(heading) 1697 # errors.append("Manual numbering definitions made in heading: " + getTextContent(heading)) 1698 if 
usedHeadingsStyleIds.count(styleId) == 0: 1699 usedHeadingsStyleIds.append(str(styleId)) 1700 headingLevel = int(styleId[len(styleId) - 1]) 1701 if fabs(headingLevel - previousHeadingLevel) > 1: 1702 #errors.append("Otsikoita ei ole käytetty oikealla tavalla peräkkäin.") 1703 errorIdsAndPositions['notSequential'] = getTextContent(heading) 1704 previousHeadingLevel = headingLevel 1705 1706 #Sort the list: ['Heading1', 'Heading2', 'Heading3', ...] 1707 usedHeadingsStyleIds.sort(cmp=None, key=None, reverse=False) 1708 #print usedHeadingsStyleIds 1709 1710 previousNumId = None 1711 1712 for headingStyleId in usedHeadingsStyleIds: 1713 1714 headingLevel = int(headingStyleId[len(usedHeadingsStyleIds[0]) - 1]) 1715 1716 headingStyleElement = _getStyleElementById(headingStyleId, styleXml) 1717 1718 # Get the numbering definitions of the heading style. 1719 # Default ilvl value to 0 -> ilvl-element not found (level is 0). 1720 styleDefinitions = {'w:ilvl': '0', 'w:numId': None, 'w:outlineLvl': None} 1721 styleDefinitions = _getStyleDefinitions(headingStyleElement, styleDefinitions) 1722 1723 # Chekc that the numbering style definitions are OK. 
1724 if styleDefinitions['w:numId'] is None: 1725 errorIdsAndPositions['styleNotUsed'] = headingStyleId 1726 #errors.append(headingStyleId + " numbering is not used.") 1727 #return 1728 if previousNumId is not None and styleDefinitions['w:numId'] != previousNumId: 1729 errorIdsAndPositions['differentNumbering'] = headingStyleId 1730 #errors.append(headingStyleId + " is using different numbering as the previous level heading style.") 1731 previousNumId = styleDefinitions['w:numId'] 1732 if int(styleDefinitions['w:ilvl']) != headingLevel - 1: 1733 errorIdsAndPositions['notMultilevel'] = headingStyleId 1734 #errors.append(headingStyleId + " numbering level is not correct, numbering is not multilevel.") 1735 if int(styleDefinitions['w:outlineLvl']) != headingLevel - 1: 1736 errorIdsAndPositions['outlineLvl'] = headingStyleId 1737 #errors.append(headingStyleId + " outline level is not correct.") 1738 1739 # Find the numbering definition element associated to the heading style. 1740 # Get the abstract numbering definition id from the numbering definition element. 1741 # Find the abstract numbering definition element with the correct id. 1742 # Find the numbering level definition with the same level that the heading style. 1743 try: 1744 numElement = _getElementByAttributeValue(numXml.getElementsByTagName('w:num'), 'w:numId', styleDefinitions['w:numId']) 1745 abstractNumId = numElement.getElementsByTagName('w:abstractNumId')[0].getAttribute('w:val') 1746 absNumElement = _getElementByAttributeValue(numXml.getElementsByTagName('w:abstractNum'), 'w:abstractNumId', abstractNumId) 1747 lvlElement = _getElementByAttributeValue(absNumElement.getElementsByTagName('w:lvl'), 'w:ilvl', styleDefinitions['w:ilvl']) 1748 except: 1749 # errors.append(headingName + " numbering level definitions not found.") 1750 continue 1751 1752 # Get the numbering level definitions. 
1753 numDefinitions = {'w:start': None, 'w:numFmt': None, 'w:pStyle': None, 'w:lvlText': None, 'w:lvlJc': None, 'w:tentative': None} 1754 numDefinitions = _getStyleDefinitions(lvlElement, numDefinitions) 1755 1756 # TODO: should we check that the numbering is in format 1, 1.1, 1.1.1 etc ? 1757 if numDefinitions['w:start'] != '1': 1758 errorIdsAndPositions['numStart'] = headingStyleId 1759 #" numbering doesn't start at number 1.") 1760 if numDefinitions['w:pStyle'] != headingStyleId: 1761 errorIdsAndPositions['numWrong'] = headingStyleId 1762 #" numbering is not done correctly using heading styles.") 1763 if numDefinitions['w:numFmt'] != "decimal": 1764 errorIdsAndPositions['numFormat'] = headingStyleId 1765 #errors.append(headingStyleId + " numbering format is not a decimal number.") 1766 1767 return True
1768
def _getParagraphElementsByStyleId(docXml, styleId):
    '''Gets all paragraph elements in the document that use a given style id.

    A paragraph matches when the w:val attribute of its first w:pStyle
    element equals the style id. Paragraphs that contain no w:pStyle
    element never match and are skipped.

    @param docXml: document.xml as a DOM tree.
    @param styleId: The style id to look for.

    @return: A list of the matching w:p elements, in the order
    getElementsByTagName yields them. Empty if nothing matches.
    '''
    paragraphList = []

    for p in docXml.getElementsByTagName('w:p'):
        styleElements = p.getElementsByTagName('w:pStyle')
        # Test for a missing w:pStyle explicitly instead of catching every
        # possible exception around the [0] lookup.
        if len(styleElements) == 0:
            continue
        if styleElements[0].getAttribute('w:val') == styleId:
            paragraphList.append(p)
    return paragraphList
1781
def _getParagraphElementsBySequentialStyleName(styleNamePrefix, styleXml, docXml):
    '''Collects the paragraphs whose style belongs to a numbered style family.

    Style names are probed in sequence: "<prefix> 1", "<prefix> 2", ... (for
    example heading 1, heading 2 or index 1, index 2). Probing stops at the
    first number for which _getStyleIdByName returns None.

    @note: The result is grouped by style number (all paragraphs of style 1,
    then all of style 2, ...), not ordered by position in the document.

    @param styleNamePrefix: The prefix of the sequential style name. Surrounding
    whitespace is ignored; a single space is put between prefix and number.
    @param styleXml: styles.xml as a DOM tree.
    @param docXml: document.xml as a DOM tree.

    @return: A list of the w:p elements using any of the found styles.
    '''
    namePrefix = styleNamePrefix.strip() + " "
    collected = []
    number = 1

    styleId = _getStyleIdByName(namePrefix + str(number), styleXml)
    while styleId is not None:
        collected.extend(_getParagraphElementsByStyleId(docXml, styleId))
        number += 1
        styleId = _getStyleIdByName(namePrefix + str(number), styleXml)

    return collected
1802
def checkIndex(document):
    '''Checks that the document has an automatically made index.

    The index entries are the paragraphs using the "index N" styles. The index
    counts as automatically made when one of the two nodes preceding the first
    index entry contains a w:instrText element whose text includes 'INDEX'.

    @param document: A mapping from part names ('word/document.xml',
    'word/styles.xml') to their DOM trees.

    @return: False if an index is missing, '2' if index is not automatically made and True if everything was OK.
    '''
    docXml = document['word/document.xml']
    styleXml = document['word/styles.xml']

    indexParagraphs = _getParagraphElementsBySequentialStyleName("index ", styleXml, docXml)
    if len(indexParagraphs) == 0:
        return False

    # The previous w:p element of the first index entry should be something like this:
    #<w:p w:rsidR="002F2A09" w:rsidRDefault="00CA51D5">
    #<w:pPr>
    #<w:sectPr w:rsidR="002F2A09" w:rsidSect="002F2A09"> --- </w:sectPr>
    #</w:pPr>
    #<w:r>
    #<w:fldChar w:fldCharType="begin"/>
    #</w:r>
    #<w:r>
    #<w:instrText xml:space="preserve"> INDEX \c "2" \z "1035" </w:instrText>
    #</w:r>
    #<w:r>
    #<w:fldChar w:fldCharType="separate"/>
    #</w:r>
    #</w:p>

    # Search the 'instrText' field element in the previous sibling and, failing
    # that, in the sibling before it (there can be a section break between the
    # first index entry and the field declaration).
    indexFieldCodeElement = None
    sibling = indexParagraphs[0].previousSibling
    for _attempt in range(2):
        if sibling is None:
            break
        try:
            indexFieldCodeElement = sibling.getElementsByTagName('w:instrText')[0]
            break
        except (AttributeError, IndexError):
            # AttributeError: the sibling is not an element (e.g. a text node).
            # IndexError: the sibling holds no w:instrText element.
            sibling = sibling.previousSibling

    if indexFieldCodeElement is None:
        # Index is not a field - it was made manually.
        return '2'
    if getTextContent(indexFieldCodeElement).find('INDEX') == -1:
        # The field found is not an INDEX field declaration.
        return '2'

    return True
1852
def checkIndexContent(document):
    r''' Checks that the document has a index that is not empty, and that the index entries are referenced somewhere in the document.

    Collects the text of the paragraphs using the "index N" styles and checks that there is any.
    Collects the text of every w:instrText element of the document, splits it into
    index reference components and matches them to the index content.

    @note: XML example:

    Index example:

        <w:p w:rsidR="002F2A09" w:rsidRDefault="00CA51D5">
            <w:r>
                <w:fldChar w:fldCharType="begin"/>
            </w:r>
            <w:r>
                <w:instrText xml:space="preserve"> INDEX \c "2" \z "1035" </w:instrText>
            </w:r>
            <w:r>
                <w:fldChar w:fldCharType="separate"/>
            </w:r>
        </w:p>
        <w:p w:rsidR="002F2A09" w:rsidRDefault="002F2A09">
            <w:pPr>
                <w:pStyle w:val="Index1"/>
                <w:tabs>
                    <w:tab w:val="right" w:leader="dot" w:pos="3950"/>
                </w:tabs>
            </w:pPr>
            <w:r>
                <w:t>Index entry level 1</w:t>
            </w:r>
        </w:p>

    Reference example:

        <w:r w:rsidR="00B27B47">
            <w:instrText xml:space="preserve"> XE "</w:instrText>
        </w:r>
        <w:r w:rsidR="00B27B47" w:rsidRPr="00B27B47">
            <w:instrText>Level 1 entry</w:instrText>
        </w:r>
        <w:r w:rsidR="00B27B47" w:rsidRPr="00B27B47">
            <w:instrText>:</w:instrText>
        </w:r>
        <w:r w:rsidR="00B27B47" w:rsidRPr="0011587C">
            <w:instrText>Level 2 entry</w:instrText>
        </w:r>

    @param document: A mapping from part names ('word/document.xml',
    'word/styles.xml') to their DOM trees.

    @return: '3' if the index is empty, '4' if the content does not match with the document and True if everything went OK.
    '''
    docXml = document['word/document.xml']
    styleXml = document['word/styles.xml']

    # Every non-empty index entry starts out as "not referenced anywhere".
    indexParagraphs = _getParagraphElementsBySequentialStyleName("index ", styleXml, docXml)
    unreferencedEntries = set()
    for p in indexParagraphs:
        textContent = getTextContent(p)
        if textContent is not None and textContent != "":
            unreferencedEntries.add(textContent)
    if len(unreferencedEntries) == 0:
        # Index is empty.
        return '3'

    documentFieldTexts = "".join(
        [getTextContent(fieldElement) for fieldElement in docXml.getElementsByTagName('w:instrText')])

    # Index entry reference example: 'XE "MainEntry"', 'XE "MainEntry:SubEntry"' or even 'XE "MainEntry:Heading" "Subentry:Heading"'
    # Parse the string containing all text content of the w:instrText-elements:
    # first split at XE_ (where _ is whitespace), next split at " and finally split at :.
    # NOTE(review): the text of fields other than XE ends up in the components as well.
    # TODO: check that the reference makes actually sense? Page number to the index comes from the page where the reference is.
    indexReferenceFieldsContent = []
    for field in documentFieldTexts.split('XE '):
        for candidate in field.split('"'):
            for finalCandidate in candidate.split(":"):
                if finalCandidate.strip() != "":
                    indexReferenceFieldsContent.append(finalCandidate)

    # A component references an entry when either text contains the other.
    # Every matching entry is marked, not just the first one found: stopping at
    # the first match let one entry shadow another that the same component also
    # references, so a referenced entry could be reported as unreferenced.
    for indexReferenceComponent in indexReferenceFieldsContent:
        referenced = [entry for entry in unreferencedEntries
                      if entry.find(indexReferenceComponent) != -1
                      or indexReferenceComponent.find(entry) != -1]
        unreferencedEntries.difference_update(referenced)
        if len(unreferencedEntries) == 0:
            break

    if len(unreferencedEntries) > 0:
        # Some index entry is referenced nowhere; it is probably made manually.
        return '4'
    return True
1986
def checkDoubleWhitespaces(document):
    '''Checks double whitespaces in the document.

    @param document: A mapping from part names to their DOM trees; only
    'word/document.xml' is read.

    @return: The amount of occurrences of the double whitespaces found in the document, False otherwise.
    '''
    # The needle must be exactly two spaces. It is built by repetition so that
    # it cannot be silently reduced to a single space (which would count every
    # ordinary space) when the source passes through whitespace-collapsing tools.
    doubleWhitespace = ' ' * 2
    return checkStringFromDocument(document['word/document.xml'], doubleWhitespace)
1993
def checkAsterisk(document):
    '''Looks for the *-character in the text of the document.

    Delegates to checkStringFromDocument with 'word/document.xml' of the
    document and the asterisk as the searched string.

    @param document: A mapping from part names to their DOM trees.

    @return: Whatever checkStringFromDocument returns: the amount of asterisks
    found in the document, or False if there are none.
    '''
    docXml = document['word/document.xml']
    asterisk = '*'
    return checkStringFromDocument(docXml, asterisk)
2000
def checkStringFromDocument(docXml, string):
    '''Counts how many times a string occurs in the text content of the document.

    The text of each w:p element is assembled from its w:t elements and
    searched on its own, so an occurrence spanning two paragraphs is not found.

    @param docXml: document.xml as a DOM tree.
    @param string: The string to search for.

    @return: The total amount of occurrences over all paragraphs, or False if
    the string does not occur at all.
    '''
    total = 0
    for paragraph in docXml.getElementsByTagName('w:p'):
        pieces = [getTextContent(textElement)
                  for textElement in paragraph.getElementsByTagName('w:t')]
        total += "".join(pieces).count(string)

    if total > 0:
        return total
    return False
2021 # for p in docXml.getElementsByTagName('w:p'): 2022 # if(checkStringFromContent(p, " ")): 2023 # errors.append("Double whitespace in paragraph: " + getTextContent(p)) 2024
def checkTabs(document):
    '''Checks if the tabulator is used in the document.

    Every w:tab element of document.xml is examined together with the
    paragraph that getParentParagraph finds for it.

    @note: Exceptions (tabs that are not counted):

    - paragraphs whose style name starts with 'toc' or 'index' (automatically
      generated table of contents and index contain tabulators).

    - paragraphs whose text content contains 'INDEX' (before an automatically
      generated index there is a paragraph-element with <instrText>-element
      and a <tab>-element).

    - tabs without a parent paragraph, and tabs in paragraphs that have no
      w:pStyle element. NOTE(review): the latter excludes every paragraph
      without an explicit style - confirm that this is intended.

    @param document: A mapping from part names ('word/document.xml',
    'word/styles.xml') to their DOM trees.

    @return: The amount of the tabulator occurrences found in the document, False if none was found.
    '''
    #TODO: More exceptions?
    styleXml = document['word/styles.xml']
    tabElements = document['word/document.xml'].getElementsByTagName('w:tab')
    tabCount = 0

    for tab in tabElements:
        tabParent = getParentParagraph(tab, 'w:p')
        if tabParent is None:
            continue
        styleElements = tabParent.getElementsByTagName('w:pStyle')
        if len(styleElements) == 0:
            continue

        # Look the style name up once; it may be None for an unknown style id.
        styleName = _getStyleNameById(styleElements[0].getAttribute('w:val'), styleXml)
        if styleName is not None and styleName.startswith(('toc', 'index')):
            continue
        if getTextContent(tabParent).find('INDEX') != -1:
            continue

        tabCount += 1

    if tabCount == 0:
        return False
    return tabCount
2074
def isParagraphEmpty(p, styleXml):
    '''Checks if a paragraph is empty.

    A paragraph is empty when its text (whitespace stripped) is empty and none
    of the exceptions below applies.

    @note: Exceptions (the paragraph is then not regarded as empty):

    - it contains a pic:pic, w:sectPr, w:pict, w:object or a:graphic element
      (pictures, section properties, objects and graphics).
    - it is inside a table (w:tbl), e.g. an empty table cell.
    - its previous sibling is a table (a table produces an empty paragraph
      right after the table).
    - the style name of its previous sibling contains 'toc' or 'index'.
    - the style name of the sibling before the previous one contains 'index'.

    @param p: The paragraph element under inspection.
    @param styleXml: styles.xml as a DOM tree.

    @return: False if the paragraph is not empty, True if it is empty.
    '''
    #FIXME: these are surely not the only exceptions. Add more exceptions.
    if len(_getTextFromParagraph(p).strip()) != 0:
        return False

    for tagName in ('pic:pic', 'w:sectPr', 'w:pict', 'w:object', 'a:graphic'):
        if len(p.getElementsByTagName(tagName)) > 0:
            return False
    if getParentParagraph(p, 'w:tbl') is not None:
        return False

    previous = p.previousSibling
    if previous is None:
        return True
    # Non-element siblings (e.g. text nodes) have no tagName attribute.
    if getattr(previous, 'tagName', None) == 'w:tbl':
        return False
    if _elementStyleNameContains(previous, styleXml, 'toc'):
        return False
    if _elementStyleNameContains(previous, styleXml, 'index'):
        return False
    if _elementStyleNameContains(previous.previousSibling, styleXml, 'index'):
        return False
    return True


def _elementStyleNameContains(node, styleXml, fragment):
    '''Tells whether node is an element whose paragraph style name contains fragment.

    @param node: A DOM node or None.
    @param styleXml: styles.xml as a DOM tree.
    @param fragment: The text searched from the style name.

    @return: True if the style name of the node contains the fragment. False if
    it does not, or if the node is None, is not an element, or has no
    resolvable style id or style name.
    '''
    if node is None or node.nodeType != node.ELEMENT_NODE:
        return False
    styleId = _getParagraphStyleId(node)
    if styleId is None:
        return False
    styleName = _getStyleNameById(styleId, styleXml)
    if styleName is None:
        return False
    return styleName.find(fragment) != -1
2136
def checkEmptyParagraphs(document):
    ''' Counts the empty paragraphs of the document.

    Each w:p element of 'word/document.xml' is passed to isParagraphEmpty
    together with 'word/styles.xml'; the paragraphs for which it returns True
    are counted. The exceptions that keep a paragraph from being regarded as
    empty are decided by isParagraphEmpty.

    @param document: A mapping from part names to their DOM trees.

    @return: amount of empty paragraph occurrences in the document, False if none was found.
    '''
    paragraphs = document['word/document.xml'].getElementsByTagName('w:p')

    verdicts = [isParagraphEmpty(p, document['word/styles.xml']) is True
                for p in paragraphs]
    emptyParagraphsCount = verdicts.count(True)

    if emptyParagraphsCount != 0:
        return emptyParagraphsCount
    return False
2163
def checkList(document, listName='List'):
    ''' Goes through all paragraph elements in the document looking for paragraphs that use some list style.

    A paragraph uses a list style when the name of its style contains listName.
    Paragraphs without a style id, or whose style id resolves to no style
    name, are skipped.

    @param document: A mapping from part names ('word/document.xml',
    'word/styles.xml') to their DOM trees.
    @param listName: The list stylename we want to check. Defaults to 'List',
    which finds list styles such as 'List', 'List Bullet', 'List Numbered'.

    @return: True, if a list style is used in the document, False otherwise.
    '''
    docXml = document['word/document.xml']
    styleXml = document['word/styles.xml']

    for p in docXml.getElementsByTagName('w:p'):
        styleId = _getParagraphStyleId(p)
        if styleId is None:
            continue
        styleName = _getStyleNameById(styleId, styleXml)
        # The style name lookup may yield None (isParagraphEmpty guards against
        # it as well); such a paragraph simply does not match.
        if styleName is not None and styleName.find(listName) != -1:
            return True

    return False
2183
def checkSpreadsheetChart(document):
    '''Checks that the document has a chart copied from a spreadsheet document.
    The Chart must be pasted as a link.

    Only w:object elements containing a v:formulas element are regarded as
    charts. The verdict is given by the first such object that has an
    o:OLEObject element whose ProgID contains 'Excel'.

    @param document: A mapping from part names ('word/document.xml',
    'word/_rels/document.xml.rels') to their DOM trees.

    @return: True if the object is pasted as a link and the last '!'-separated
    part of its link target contains at least three %-characters, False if it
    contains fewer or if no spreadsheet chart object was found, and the string
    "Spreadsheet object is not pasted as a link." if the object is not a link.
    '''

    #TODO: not implemented in word_processing 13.5.2011
    docXml = document['word/document.xml']
    docRelsXml = document['word/_rels/document.xml.rels']

    for objectElement in docXml.getElementsByTagName('w:object'):
        if len(objectElement.getElementsByTagName('v:formulas')) == 0:
            continue

        oleObjectElements = objectElement.getElementsByTagName('o:OLEObject')
        if len(oleObjectElements) == 0:
            continue
        oleObjectElement = oleObjectElements[0]

        if oleObjectElement.getAttribute('ProgID').find('Excel') == -1:
            # The object is not from Excel; keep looking.
            continue
        if oleObjectElement.getAttribute('Type') != 'Link':
            return "Spreadsheet object is not pasted as a link."

        rid = oleObjectElement.getAttribute('r:id')
        target = getRelsTargetByRId(rid, docRelsXml)
        #Example: targetChartName = %5bmalli.xlsx%5dmalli%20Chart%201
        targetChartName = target.split("!")[-1]

        #TODO: more effective examination required, just check that there is more than three %-characters.
        return targetChartName.count('%') >= 3

    return False
2228
def checkSpreadsheetTable(document):
    '''Checks that the document has a table copied from a spreadsheet document.
    For now checks that the table is pasted as a link.

    Only w:object elements without a v:formulas element are regarded as
    tables. The verdict is given by the first such object that has an
    o:OLEObject element whose ProgID contains 'Excel'.

    @param document: A mapping from part names ('word/document.xml',
    'word/_rels/document.xml.rels') to their DOM trees.

    @return: True if the object is pasted as a link and the last '!'-separated
    part of its link target contains exactly two R- and two C-characters,
    False if it does not or if no spreadsheet table object was found, and the
    string "Spreadsheet object is not pasted as a link." if the object is not
    a link.
    '''
    docXml = document['word/document.xml']
    docRelsXml = document['word/_rels/document.xml.rels']

    for objectElement in docXml.getElementsByTagName('w:object'):
        if len(objectElement.getElementsByTagName('v:formulas')) != 0:
            continue

        oleObjectElements = objectElement.getElementsByTagName('o:OLEObject')
        if len(oleObjectElements) == 0:
            continue
        oleObjectElement = oleObjectElements[0]

        if oleObjectElement.getAttribute('ProgID').find('Excel') == -1:
            # The object is not from Excel; keep looking.
            continue
        if oleObjectElement.getAttribute('Type') != 'Link':
            return "Spreadsheet object is not pasted as a link."

        rid = oleObjectElement.getAttribute('r:id')
        target = getRelsTargetByRId(rid, docRelsXml)
        #Example: targetTableCells = R1C1:R7C4
        targetTableCells = target.split("!")[-1]

        #TODO: more effective examination might be required, just check that 2 R- and 2 C-characters are found.
        return targetTableCells.count('R') == 2 and targetTableCells.count('C') == 2

    return False
2270
def checkPresentationGraphicsChart(document):
    '''Checks that the document contains a chart pasted from PowerPoint as a vector graphics picture or as an object.
    Doesn't really know if the picture or object is actually from PowerPoint!

    The check passes as soon as one of the image paths that getImagePaths
    gets from the document relationships ends with '.emf' or '.wmf'.

    @param document: A mapping from part names to their DOM trees; only
    'word/_rels/document.xml.rels' is read.

    @return: True if a vector graphics picture was found, False otherwise.
    '''

    #TODO: not implemented in word_processing 13.5.2011

    docRelsXml = document['word/_rels/document.xml.rels']

    # Both suffixes carry the dot, so that only real file extensions match
    # (a bare 'wmf' would also accept any name that merely ends in those letters).
    vectorGraphicsSuffixes = ('.emf', '.wmf')

    for target in getImagePaths(docRelsXml):
        if target.endswith(vectorGraphicsSuffixes):
            return True

    # TODO: check the object if no vector graphics picture is found.

    # Normal jpg or png pictures may be inside the w:drawing-element.

    return False
2297