Package src :: Package inspectors :: Module docx_inspector
Source Code for Module src.inspectors.docx_inspector

   1  #!/usr/bin/python 
   2  # -*- coding: UTF-8 -*- 
   3  # 
   4  #The MIT License 
   5  # 
   6  #Copyright (c) 2011 
   7  # 
   8  #Permission is hereby granted, free of charge, to any person obtaining a copy 
   9  #of this software and associated documentation files (the "Software"), to deal 
  10  #in the Software without restriction, including without limitation the rights 
  11  #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
  12  #copies of the Software, and to permit persons to whom the Software is 
  13  #furnished to do so, subject to the following conditions: 
  14  # 
  15  #The above copyright notice and this permission notice shall be included in 
  16  #all copies or substantial portions of the Software. 
  17  # 
  18  #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
  19  #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
  20  #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
  21  #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
  22  #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
  23  #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
  24  #THE SOFTWARE. 
  25  # 
  26  #Authors: 
  27  #   Vili Auvinen (vili.k.auvinen@jyu.fi) 
  28  #   Olli Kauppinen (olli.kauppinen@jyu.fi) 
  29  #   Juho Tammela (juho.i.tammela@jyu.fi) 
  30   
  31  ''' 
  32  The module provides the methods for inspecting docx files. 
  33   
  34  @author: Vili Auvinen, Juho Tammela 
  35  ''' 
  36   
  37  from common_methods import * 
  38  from math import fabs 
  39  from conversions import convertTwipToCm, convertTwipToPt 
  40   
  41 -def _getStyleElementById(styleId, styleXml): 
  42      '''Gets a style element by the style id from the styles.xml. 
  43       
  44      The style Id links a paragraph using a style in document.xml to the right style in styles.xml. 
  45      Style id can be in different languages depending on what language the Word was that wrote the document. 
  46       
  47      @note: XML example:  
  48       
  49      <w:p> 
  50      <w:pPr> 
  51      <w:pStyle w:val="Otsikko1"/> (This is the style id.) 
  52      </w:pPr> 
  53      <r> 
  54      ... 
  55      </r> 
  56      </w:p> 
  57       
  58       
  59      @param styleId: The style id. 
  60      @param styleXml: styles.xml as a DOM tree. 
  61       
  62      @return: The style element with a given style id or None if no matching style element was found. 
  63      ''' 
  64      
  65      #styleElements = _getElementsWithinElement(styleXml, 'w:style') 
  66      styleElements = styleXml.getElementsByTagName('w:style') 
  67      for element in styleElements: 
  68          if (element.getAttribute('w:styleId') == styleId): 
  69              return element 
  70       
  71      #errors.append(styleId + ' -style id is not found.') 
  72      return None 
  73               
  74 -def _getStyleElementByName(styleName, styleXml): 
  75      '''Gets a style element by a style name from styles.xml. 
  76       
  77      Style name is found in the styles.xml. 
  78      A style has always the same name regardless of the language of the Word that wrote the document. 
  79       
  80      @note: XML example: 
  81       
  82      <w:style w:type="paragraph" w:styleId="Otsikko1"> (This is the style id.) 
  83      <w:name w:val="heading 1"/> (Here is the style name.) 
  84      ... 
  85      </w:style> 
  86       
  87      @param styleName: The style name.  
  88      @param styleXml: styles.xml as a DOM tree. 
  89       
  90      @return: The style-element with a given style name, or None if no matching style element was found. 
  91      ''' 
  92      #nameElements = _getElementsWithinElement(styleXml, 'w:name') # w:name -element is style-element's child 
  93      nameElements = styleXml.getElementsByTagName('w:name') 
  94      for element in nameElements: 
  95          if (element.getAttribute('w:val') == styleName): 
  96              return element.parentNode 
  97      return None 
  98   
  99 -def _getBasedOnStyleId (styleName, styleXml): 
 100      '''Get the based-on style style id for a given style from styles.xml. 
 101       
 102      @param styleName: The style name of the style that's based-on style id is wanted. 
 103      @param styleXml: styles.xml as a DOM tree. 
 104       
 105      @return: The id of the based on style for a given style name, or None if there was no based on style.''' 
 106      # FIXME: returns actually the styleId of the basedOn-style NOT the styleName. 
 107      if _getStyleElementByName(styleName, styleXml) is not None: 
 108          try: 
 109              return _getStyleElementByName(styleName, styleXml).getElementsByTagName('w:basedOn')[0].getAttribute('w:val') 
 110          except: 
 111              return None 
 112      return None 
 113   
 114 -def _getStyleName(styleElement): 
 115      '''Get a style name of the style element. 
 116       
 117      @param styleElement: The style element whose style name is wanted. 
 118       
 119      @return: The style name of a given style element.''' 
 120      # TODO: try except 
 121      return styleElement.getElementsByTagName('w:name')[0].getAttribute('w:val') 
 122   
 123 -def _getStyleNameById(styleId, styleXml): 
 124      '''Get the name of a style with a given style id. 
 125       
 126      @param styleId: The style id to be looked for. 
 127      @param styleXml: styles.xml as a DOM tree. 
 128       
 129      @return: The style name of the style with the correct style id, or None if it wasn't found. 
 130      ''' 
 131      styleElement = _getStyleElementById(styleId, styleXml) 
 132      if styleElement is not None: 
 133          return _getStyleName(styleElement) 
 134      return None 
 135   
 136 -def _getStyleIdByName(styleName, styleXml): 
 137      '''Get the id of a style with a given style name from styles.xml. 
 138       
 139      @param styleName: The style name to be looked for. 
 140      @param styleXml: styles.xml as a DOM tree. 
 141       
 142      @return: The style id of the style with the correct style name, or None if it wasn't found. 
 143      ''' 
 144      #styleElements = _getElementsWithinElement('w:name', styleXml) 
 145      styleElements = styleXml.getElementsByTagName('w:name') 
 146      for styleElement in styleElements: 
 147          if (styleElement.getAttribute('w:val').lower() == styleName.lower()): 
 148              styleId = styleElement.parentNode.getAttribute('w:styleId') 
 149              return styleId 
 150      return None 
 151   
 152 -def _getParagraphStyleId(p): 
 153      '''Gets the style id of a paragraph element. 
 154       
 155      @param p: The paragraph element. 
 156       
 157      @return: The style id if it was found, otherwise returns None. 
 158      ''' 
 159      try: 
 160          styleId = p.getElementsByTagName('w:pStyle')[0].getAttribute('w:val') 
 161      except: 
 162          return None 
 163      return styleId 
 164   
 165   
 166 -def _getThemeFont(themeXml, styleDefinitions, themeFont): 
 167      '''Gets a themefont from theme1.xml. 
 168       
 169      @note: XML example: 
 170       
 171      <a:fontScheme name="Office"> 
 172      <a:majorFont> 
 173      <a:latin typeface="Cambria"/> 
 174      <a:ea typeface=""/> 
 175      <a:cs typeface=""/> 
 176      </a:majorFont> 
 177      <a:minorFont> 
 178      <a:latin typeface="Calibri"/> 
 179      <a:ea typeface=""/> 
 180      <a:cs typeface=""/> 
 181      </a:minorFont> 
 182      </a:fontScheme> 
 183       
 184      @param themeXml: theme1.xml as DOM tree. 
 185      @param styleDefinitions: The style definitions dict.  
 186      @see: _getCompleteStyleDefinitions. 
 187      @param themeFont: The theme font. Should be either 'majorFont' or 'minorFont'. 
 188       
 189      @return: The style definitions with or without changes.     
 190     
 191      ''' 
 192      if themeFont == "" or themeFont is None: 
 193          return styleDefinitions 
 194       
 195      fontElement = None 
 196       
 197      if themeFont.startswith('major'): 
 198          fontElement = themeXml.getElementsByTagName('a:majorFont')[0] 
 199      elif themeFont.startswith('minor'): 
 200          fontElement = themeXml.getElementsByTagName('a:minorFont')[0] 
 201           
 202      if fontElement is not None: 
 203          themeFont = fontElement.getElementsByTagName('a:latin')[0].getAttribute('typeface') 
 204          if themeFont.strip() != "": 
 205              styleDefinitions['w:ascii'] = themeFont 
 206              styleDefinitions['w:asciiTheme'] = None 
 207       
 208      return styleDefinitions 
 209   
 210 -def _getStyleDefinitions (element, styleDefinitions): 
 211      ''' Return style definitions of a given element. 
 212      First checks if the element has any children and uses recursion if some are found. 
 213      Next checks if the element has attributes. 
 214        - If the attribute name is a key in the dict, stores the value of the attribute. 
 215        - If the attribute name is 'w:val' and the element tag name is a key in the dict, stores the value of the attribute. 
 216      If the element tag name is a key in the dict and the element doesn't have any attributes or children, stores value '1' in the dict. 
 217       
 218      @param element: Style definitions are searched inside this element 
 219      @param styleDefinitions: A dict where the style definitions are stored.  
 220                               May contain tag names or attribute names and some default values. 
 221       
 222      @return: The style definitions in a dict. 
 223      ''' 
 224       
 225  #    <w:pPr> 
 226  #        <w:pBdr> 
 227  #            <w:top w:val="single" w:sz="24" w:space="0" w:color="4F81BD"/> 
 228  #            <w:left w:val="single" w:sz="24" w:space="0" w:color="4F81BD"/> 
 229  #            <w:bottom w:val="single" w:sz="24" w:space="0" w:color="4F81BD"/> 
 230  #            <w:right w:val="single" w:sz="24" w:space="0" w:color="4F81BD"/> 
 231  #        </w:pBdr> 
 232  #        <w:shd w:val="clear" w:color="auto" w:fill="4F81BD"/> 
 233  #        <w:spacing w:before="360" w:after="0"/> 
 234  #        <w:outlineLvl w:val="0"/> 
 235  #    </w:pPr> 
 236  # We are not interested in border style definitions as above at the moment. Implement here if needed later. 
 237      if element.tagName == "w:pBdr": 
 238          return styleDefinitions 
 239       
 240      for child in element.childNodes: 
 241          if child.nodeType != child.TEXT_NODE: 
 242              styleDefinitions = _getStyleDefinitions(child, styleDefinitions) 
 243       
 244      if element.hasAttributes(): 
 245          for i in range (0, element.attributes.length): 
 246              attributeName = element.attributes.item(i).name 
 247              if styleDefinitions.has_key(attributeName): 
 248                  styleDefinitions[attributeName] = element.attributes.item(i).value 
 249  #            if attributeName != "w:val": 
 250  #                styleDefinitions[element.attributes.item(i).name] = element.attributes.item(i).value 
 251              elif styleDefinitions.has_key(element.tagName) and attributeName == "w:val": 
 252                  styleDefinitions[element.tagName] = element.attributes.item(i).value 
 253       
 254      elif styleDefinitions.has_key(element.tagName) and element.hasChildNodes() is False: 
 255          styleDefinitions[element.tagName] = True 
 256           
 257      return styleDefinitions 
 258   
 259   
 260 -def getStyle(document, requirementStyleName): 
 261      '''Gets all definitions of a style from document dictionary. 
 262       
 263      Converts twips to centimeters. 
 264       
 265      @return: A dict with all the style definitions of the one style with the 
 266               translated keys to match return value odt_inspector's getStyle(). False, if the style was not found. 
 267      ''' 
 268      styleXml = document['word/styles.xml'] 
 269      themeXml = document['word/theme/theme1.xml'] 
 270       
 271       
 272      if _isStyleUsed(document, requirementStyleName) is False: 
 273          return False 
 274       
 275      #styleName is a capitalized string or CamelCase string (for example: Normal, Body Text) 
 276      #However, heading stylenames are in the xml in lower case (heading 1) 
 277      #Also stylenames such as toc 1, index 1, footer, header, caption are in lower case. 
 278       
 279      #The following fixes this broblem, as it gets the styleId by comparing lowercase stylename strings,  
 280      #and then gets the correct styleName by the styleId. 
 281      styleId = _getStyleIdByName(requirementStyleName, styleXml) 
 282      styleName = _getStyleNameById(styleId, styleXml) 
 283       
 284      styleDefinitions = _getCompleteStyleDefinitions(styleXml, styleName, themeXml) 
 285       
 286      if styleDefinitions is None: 
 287          return False 
 288       
 289      translateDict = {'w:name':'styleName', 
 290          'w:ascii':'fontName', 
 291          'w:sz':'fontSize', 
 292          'w:caps':'transform', 
 293          'w:left':'indentLeft', 
 294          'w:right':'indentRight', 
 295          'w:firstLine':'indentFirstLine', 
 296          'w:line':'linespacing', 
 297          'w:before':'spacingBefore', 
 298          'w:after':'spacingAfter', 
 299          'w:keepNext':'keepWithNext', 
 300          'w:jc':'alignment', 
 301          'w:widowControl':'widowControl', 
 302          #'w:widowControl':'widows', #TODO: widows kovakoodattuna 
 303          'w:b':'bold', 
 304          'w:i':'italic'} 
 305      styleDict = {} 
 306       
 307      # In case styleName is changed, let's return the same styleName as was given in the parameter. 
 308      styleDefinitions['w:name'] = requirementStyleName 
 309      # Line spacing single (1) = 12 points = 240 twips. 
 310      # <w:spacing w:before="300" w:after="340" w:line="240" w:lineRule="auto"/> 
 311      # For example: w:line="360" -> line spacing is 1.5 
 312      styleDefinitions['w:line'] = float(styleDefinitions['w:line']) / float(240.0) 
 313       
 314      styleDefinitions['w:sz'] = round(float(styleDefinitions['w:sz']) / 2, 1) # nyt voi olla 13.5 
 315      styleDefinitions['w:before'] = convertTwipToPt(float(styleDefinitions['w:before'])) 
 316      styleDefinitions['w:after'] = convertTwipToPt(float(styleDefinitions['w:after'])) 
 317  #FIXME:     
 318  #    File "/var/www/virtual.hosts/sovellusprojektit.it.jyu.fi/parsi/sovellus/docx_inspector.py", line 244, in getStyle 
 319  #    styleDefinitions['w:left'] = round( convertTwipToCm( float(styleDefinitions['w:left']) ), 1) 
 320  #    ValueError: invalid literal for float(): single 
 321      styleDefinitions['w:left'] = round(convertTwipToCm(float(styleDefinitions['w:left'])), 1) 
 322      # Rounded with a precision of one decimal. If this is used more often, update to conversions.py. 
 323       
 324      for key in translateDict.keys(): 
 325          styleDict[translateDict[key]] = styleDefinitions[key] 
 326       
 327      return styleDict 
 328   
 329 -def _getCompleteStyleDefinitions(styleXml, styleName, themeXml): 
 330      ''' Returns the style definition of the given style from style.xml and theme1.xml. 
 331      Recursion used because the style can be based on some other style. 
 332      In addition, the base style gets style definitions from the document defaults. 
 333      Finally, some style definitions are not found in the XML file at all. These definitions use some default value which must be assumed. 
 334       
 335      @note: XML example: 
 336       
 337      <w:style w:type="paragraph" w:default="1" w:styleId="Normaali"> 
 338      <w:name w:val="Normal"/> 
 339      <w:qFormat/> 
 340      <w:rsid w:val="006B493C"/> 
 341      <w:pPr> 
 342      <w:spacing w:before="140" w:after="220" w:line="360" w:lineRule="auto"/> 
 343      <w:ind w:left="567"/> 
 344      <w:jc w:val="both"/> 
 345      </w:pPr> 
 346      <w:rPr> 
 347      <w:rFonts w:ascii="Georgia" w:hAnsi="Georgia"/> 
 348      <w:lang w:val="fi-FI"/> 
 349      </w:rPr> 
 350      </w:style> 
 351       
 352      @param styleXml: styles.xml-file as a DOM tree. 
 353      @param styleName: The name of the style (NOT the id)  
 354      @see: _getStyleElementById and _getStyleElementByName for difference. 
 355      @param themeXml: theme1.xml as DOM tree. 
 356       
 357      @return: Style definitions in a dict. 
 358      ''' 
 359       
 360  #    If fontSize is not found mentioned in xml, the user has used the default size which is 12 (*2 = 24) for text body     
 361  #    TODO: initialize dict with the default style definitions. 
 362  #    TODO: some of the values are in twips, convert to cm. 
 363       
 364  #     Complete style-element specification can be found at: http://www.schemacentral.com/sc/ooxml/e-w_style-1.html 
 365  #     Style-element content for the most part: 
 366  #     w:name [0..1]    Primary Style Name 
 367  #     w:aliases [0..1]    Alternate Style Names 
 368  #     w:basedOn [0..1]    Parent Style ID 
 369  #     w:next [0..1]    Style For Next Paragraph 
 370  #     w:link [0..1]    Linked Style Reference 
 371  #     w:autoRedefine [0..1]    Automatically Merge User Formatting Into Style Definition 
 372  #     w:hidden [0..1]    Hide Style From User Interface 
 373  #     w:semiHidden [0..1]    Hide Style From Main User Interface 
 374  #     w:unhideWhenUsed [0..1]    Remove Semi-Hidden Property When Style Is Used 
 375  #     w:qFormat [0..1]    Primary Style 
 376  #     w:locked [0..1]    Style Cannot Be Applied 
 377  #    w:pPr: - paragraph properties 
 378  #        w:pStyle [0..1]    Referenced Paragraph Style 
 379  #         w:keepNext [0..1]    Keep Paragraph With Next Paragraph 
 380  #         w:keepLines [0..1]    Keep All Lines On One Page 
 381  #         w:pageBreakBefore [0..1]    Start Paragraph on Next Page 
 382  #         w:widowControl [0..1]    Allow First/Last Line to Display on a Separate Page 
 383  #         w:numPr [0..1]    Numbering Definition Instance Reference 
 384  #             w:numPr/w:numId [0..1]    Numbering Definition Instance Reference 
 385  #         w:spacing [0..1]    Spacing Between Lines and Above/Below Paragraph 
 386  #            w:before    [0..1]    Spacing Above Paragraph     
 387  #            w:beforeLines    [0..1]    Spacing Above Paragraph IN Line Units     
 388  #            w:beforeAutospacing    [0..1]    Automatically Determine Spacing Above Paragraph     
 389  #            w:after    [0..1]    Spacing Below Paragraph     
 390  #            w:afterLines    [0..1]    Spacing Below Paragraph in Line Units     
 391  #            w:afterAutospacing    [0..1]    Automatically Determine Spacing Below Paragraph     
 392  #            w:line    [0..1]    Spacing Between Lines in Paragraph     
 393  #            w:lineRule    [0..1]    Type of Spacing Between Lines 
 394  #         w:ind [0..1]    Paragraph Indentation 
 395  #            w:left    [0..1]    Left Indentation     
 396  #            w:leftChars    [0..1]    Left Indentation in Character Units     
 397  #            w:right    [0..1]   Right Indentation     
 398  #            w:rightChars    [0..1]    Right Indentation in Character Units     
 399  #            w:hanging    [0..1]    Indentation Removed from First Line     
 400  #            w:hangingChars    [0..1] Indentation Removed From First Line in Character Units     
 401  #            w:firstLine    [0..1] Additional First Line Indentation     
 402  #            w:firstLineChars    [0..1] Additional First Line Indentation in Character Units 
 403  #         w:jc [0..1]    Paragraph Alignment 
 404  #         w:outlineLvl [0..1]    Associated Outline Level 
 405  #    w:rPr - run properties 
 406  #        2. w:rFonts [0..1]    Run Fonts 
 407  #                w:ascii    [0..1]    w:ST_String    ASCII Font     
 408  #                w:hAnsi    [0..1]    w:ST_String    High ANSI Font 
 409  #                w:cs    [0..1]    w:ST_String    Complex Script Font     
 410  #                w:asciiTheme    [0..1]    w:ST_Theme    ASCII Theme Font     
 411  #                w:hAnsiTheme    [0..1]    w:ST_Theme    High ANSI Theme Font 
 412  #                w:cstheme    [0..1]    w:ST_Theme    Complex Script Theme Font 
 413  #         3. w:b [0..1]    Bold 
 414  #         4. w:bCs [0..1]    Complex Script Bold 
 415  #         5. w:i [0..1]    Italics 
 416  #         6. w:iCs [0..1]    Complex Script Italics 
 417  #         7. w:caps [0..1]    Display All Characters As Capital Letters 
 418  #         8. w:smallCaps [0..1]    Small Caps 
 419  #         9. w:strike [0..1]    Single Strikethrough 
 420  #        10. w:dstrike [0..1]    Double Strikethrough 
 421  #        11. w:outline [0..1]    Display Character Outline 
 422  #        15. w:noProof [0..1]    Do Not Check Spelling or Grammar 
 423  #        24. w:sz [0..1]    Font Size 
 424  #        25. w:szCs [0..1]    Complex Script Font Size 
 425  #        27. w:u [0..1]    Underline 
 426  #        36. w:lang [0..1]    Languages for Run Content 
 427               
 428  #     Initialize the styleDefinitions dict with the style definitions you wish to search and their default values. 
 429  #     Rules of initialization: 
 430  #     1. Use attribute name as a key. 
 431  #     2. If attribute name is 'w:val', use element tag name as a key. 
 432  #     3. If the element can be empty and attributes are optional, use element tag name as a key. 
 433      styleDefinitions = {'w:name': None, 
 434                         'w:basedOn': None, 
 435                         'w:next': None, 
 436                         'w:link': None, 
 437                         'w:autoRedefine': None, 
 438                         'w:left': '0', 
 439                         'w:right': '0', 
 440                         'w:firstLine': None, 
 441                         'w:line': '240.0', 
 442                         'w:before': '0', 
 443                         'w:after': '0', 
 444                         'w:widowControl': True, # on as default 
 445                         'w:jc': 'left', #left as default 
 446                         'w:sz': '24', 
 447                         'w:ascii': None, 
 448                         'w:asciiTheme': None, 
 449                         'w:b': False, # default 
 450                         'w:i': False, # default 
 451                         'w:u': False, # default 
 452                         #'w:outline': None, 
 453                         #'w:numId': None, 
 454                         #'w:ilvl': None, 
 455                         'w:lang': None, 
 456                         'w:keepNext': None, 
 457                         'w:keepLines': None, 
 458                         'w:pageBreakBefore': None, 
 459                         'w:caps': None} 
 460   
 461      styleElement = _getStyleElementByName(styleName, styleXml) 
 462       
 463      if styleElement is None: 
 464          return 
 465           
 466      basedOnStyleId = _getBasedOnStyleId(styleName, styleXml) 
 467   
 468  #     Recursion: if this style has a basedOn-style, method calls itself with the basedOnStyleName.  
 469  #     When style has no basedOn-style, get the definitions in w:docDefaults-element first. 
 470      if basedOnStyleId is not None: 
 471          styleDefinitions = _getCompleteStyleDefinitions(styleXml, _getStyleNameById(basedOnStyleId, styleXml), themeXml) 
 472      elif basedOnStyleId is None: 
 473          styleDefinitions = _getStyleDefinitions(styleXml.getElementsByTagName('w:docDefaults')[0], styleDefinitions) 
 474       
 475      styleDefinitions = _getStyleDefinitions(styleElement, styleDefinitions) 
 476       
 477      if(styleDefinitions.has_key('w:asciiTheme')): 
 478          styleDefinitions = _getThemeFont(themeXml, styleDefinitions, styleDefinitions['w:asciiTheme']) 
 479       
 480      return styleDefinitions 
 481   
 482 -def _getElementValueWithinElement(elementTagName, element): 
 483      '''Gets the text content of the first element with a certain tag in the given DOM tree. 
 484       
 485      @return: The text value of the element, or None if something went wrong. 
 486      ''' 
 487      try: 
 488          value = element.getElementsByTagName(elementTagName)[0].firstChild.nodeValue 
 489          return value 
 490      except: 
 491          return None 
 492   
 493 -def _getElementWithinElement(element, elementTagName): 
 494      '''Gets the first child of an element with the given tag name. 
 495       
 496      Returns the element of a the given parent element with the given elementTagName.  
 497       
 498      @param element: The element whose children are searched. 
 499      @param elementTagName: The tag name of the wanted element. 
 500       
 501      @return: An element with the right tag name, or None if it wasn't found. 
 502      ''' 
 503      try: 
 504          value = element.getElementsByTagName(elementTagName)[0] 
 505      except IndexError: 
 506          return None 
 507      return value 
 508   
 509 -def _getElementsWithinElement(element, elementTagName): 
 510      '''Gets the children of an element with the given tag name. 
 511       
 512      Returns the element of the given parent element by the given elementTagName.  
 513       
 514      @param element: The element whose children are searched. 
 515      @param elementTagName: Tag name of the wanted elements. 
 516       
 517      @return: The list of elements with the right tag name, or None if none was found. 
 518      ''' 
 519      elements = element.getElementsByTagName(elementTagName) 
 520      if len(elements) == 0: 
 521          return None 
 522      return elements 
 523   
 524  #def _getElementsWithinElement(elementTagName, xmlData): 
 525  #    ''' Returns the elements of the given xml file if xmlData and the given elementTagName  
 526  #        exist in the xmlData. 
 527  #        
 528  #       xmlData - the given xml file''' 
 529  #    elements = xmlData.getElementsByTagName(elementTagName) 
 530  #    if len(elements) == 0:  
 531  #        return None 
 532  #    return elements 
 533   
 534   
 535 -def _getTargetXmlFileByHeader(header, document):  
 536      '''Gets a header reference target xml-file as a DOM tree.''' 
 537      #TODO: try except 
 538      targetFile = getRelsTargetByRId(header.getAttribute('r:id'), document['word/_rels/document.xml.rels']) 
 539      targetFileXml = document['word/' + targetFile] # omaan metodiin 
 540      #targetFileXml = minidom.parseString(targetFile) 
 541      return targetFileXml 
 542   
 543 -def _checkFrontPageHeadersAndFooters(references, document): 
 544      '''Goes through header or footer references and checks if there is any content in them. 
 545       
 546      Checks if there are headers or footers in the front page by looking for <w:t> tags. 
 547      Even if there are references to headers or footers, they might be empty. 
 548       
 549      @param references: Header or footer references. 
 550       
 551      @return: The header or footer target XML file as a DOM tree, or None if no headers or footers were found. 
 552      ''' 
 553       
 554      if references is not None: 
 555          for header in references: 
 556              targetFileXml = _getTargetXmlFileByHeader(header, document) 
 557              #TODO: eksaktimmin, _getElementValueWithinElement? 
 558              if _getElementsWithinElement(targetFileXml, 'w:t') is not None:  
 559                  return targetFileXml 
 560   
 561                   
 562 -def _checkAutomaticPageNumbering(section, headerReference, footerReference, document, errorIds, numStartKey): 
 563      '''Checks if a section has an automatic page numbering and gets the numbering format. 
 564       
 565      First goes through the section element and checks that the numbering starts at 1. 
 566      Gets the section numbering of format definition. 
 567      If it is defined, returns it. 
 568      If a numbering format is not found in the section properties, it defaults to 'Standard'. 
 569      If the numbering format is standard, checks the header and footer references for other numbering format definitions. 
 570      The numbering format in the header or the footer reference is sometimes in <w:instrText> element inside the content of PAGE \* MERGEFORMAT. 
 571           
 572      @param section: The section element to be searched for. 
 573      @param headerReference: The current header of the section element as a DOM tree. 
 574      @param footerReference: The current footer of the section element as a DOM tree. 
 575      @param document: The document as a dict of DOM tree pairs. 
 576      @param errorIds: The dict for appending errors True/False. 
 577      @param numStartKey: The key for errorIds to append numbering start error. 
 578           
 579      @return: The page numbering as a string format, or False if there was no page numbering or the numbering was both in header and footer. 
 580      ''' 
 581      pgNumTypeElement = _getElementWithinElement(section, 'w:pgNumType') 
 582      if pgNumTypeElement is None: 
 583          return False 
 584       
 585      startNum = pgNumTypeElement.getAttribute('w:start') 
 586      if str(startNum) != '1': 
 587          errorIds[numStartKey] = False 
 588      else: 
 589          errorIds[numStartKey] = True 
 590   
 591      numFormat = _getPgNumFormat(pgNumTypeElement) 
 592       
 593      #1. check if instrtext in referenceXml. 
 594      #2. check if 'page' and 'mergeformat' texts are found -> page numbering field is found 
 595      #3. check if there is a pagenumbering format definition in instrtext and return it 
 596      #4. else return numFormat = "Standard" 
 597      if numFormat == "Standard": 
 598          headerFormat = None 
 599          if headerReference is not None: 
 600               
 601              elementValue = _getElementValueWithinElement('w:instrText', headerReference) 
 602              if elementValue is None: 
 603                  headerFormat = None 
 604              elif elementValue.find('PAGE') and elementValue.find('MERGEFORMAT'): 
 605                  splitted = elementValue.split('\*') 
 606                  if len(splitted) > 2: 
 607                      headerFormat = splitted[1].lower().strip() 
 608                       
 609          footerFormat = None 
 610          if footerReference is not None: 
 611               
 612              elementValue = _getElementValueWithinElement('w:instrText', headerReference) 
 613              if elementValue is None: 
 614                  footerFormat = None 
 615              elif elementValue.find('PAGE') and elementValue.find('MERGEFORMAT'): 
 616                  splitted = elementValue.split('\*') 
 617                  if len(splitted) > 2: 
 618                      footerFormat = splitted[1].lower().strip() 
 619       
 620          if headerFormat is not None and footerFormat is not None: 
 621              #TODO: numbering in both header and footer. 
 622              return False 
 623          elif headerFormat is not None and headerFormat != "Standard": 
 624              return headerFormat 
 625          elif footerFormat is not None and footerFormat != "Standard": 
 626              return footerFormat 
 627       
 628      return numFormat 
 629   
 630 -def _checkNameInHeaderOrFooter(reference, document): 
 631      '''Looks for text inside a header or footer and sees if the last modifier's name is in there. 
 632       
 633      Problem: sometimes we want to check that there is no name in the header or the footer. 
 634      If a name is found but it's different from the last modifier's name, result is False, even though a name is in a header/footer. 
 635      For now just tries to check that either the name of the last modifier or just some name was found. 
 636       
 637      @param reference: The header or footer XML file as a DOM tree. 
 638       
 639      @return: True if a name is found in the text, False otherwise. 
 640      ''' 
 641      if reference is not None: 
 642          pElements = _getElementsWithinElement(reference, 'w:p') 
 643          for pElement in pElements: 
 644              textContent = getTextContent(pElement) 
 645               
 646               
 647              #FIXME: if the name is different in the text and in the settings, this could give false negatives: 
 648              #For example, if we don't want that toc section has a name in footer or header and the name is different in the text 
 649              #and in the settings, this function returns False even though there is a name in the header or footer. 
 650              if textContent.find(_getLastModifier(document["docProps/core.xml"])) != -1: 
 651                  return True 
 652               
 653              #FIXME: fix the problem above, then see if this is necessary: 
 654              #When the following if is done, the function can return True even though the name is different in the header or footer 
 655              #and document settings. 
 656               
 657              #check if the content is a digit -> page number 
 658              #split at whitespace -> len > 1 -> probably a name! 
 659              #check if the content is longer than 3 characters -> probably a name. 
 660              if textContent.find('PAGE') == -1: 
 661                  strippedContent = textContent.strip() 
 662                  if strippedContent.isdigit(): 
 663                      continue 
 664                  splittedContent = strippedContent.split() 
 665                  if len(splittedContent) > 1: 
 666                      return True             
 667                  if len(strippedContent) > 3: 
 668                      return True 
 669   
 670      return False 
 671   
 672   
 673 -def _getPgNumFormat(sectionPgNumType): 
 674      ''' Gets the number format of the given section page number type. 
 675           
 676      @param sectionPgNumType: The given page number type element of the section 
 677           
 678      @return: The numbering format, defaults to 'Standard' if nothing else is defined. 
 679      '''  
 680      numFormat = 'Standard' 
 681      if sectionPgNumType: 
 682          if sectionPgNumType.hasAttribute('w:fmt'): 
 683              numFormat = sectionPgNumType.getAttribute('w:fmt') 
 684                   
 685      return numFormat 
 686   
def checkHeadersAndFooters(document):
    ''' Checks that the headers and footers of a document are made correctly.
    
    Assumes that the document has three sections:
        1. cover section
        2. table of contents section or toc section
        3. actual content section or text section

    The sections are visited in that order. A section that does not provide
    its own header/footer reference keeps using the one found in an earlier
    section, so the order of the three loops below matters.

    @see: checkSections method must pass in order to run this method
    
    @param document: The document parts; 'word/document.xml' is read here and
                     the whole mapping is handed on to the helper functions.
    
    @note:
    Places findings in the errorIds-dict as key-boolean pairs:
    
    'frontPage': was there headers or footers in the cover section.
    
    'tocPageNumbering': is there a page numbering in the toc section.
    
    'differentPageNumbering': is the page numbering format different in the toc and text sections.
    
    'nameInToc': is a name found in the header or footer in use after the toc section.
    
    'nameInText': is a name found in the header or footer in use after the text section.
    
    'pageNumbering': is there a page numbering in the text section.
    
    'tocNumStart': key name handed to _checkAutomaticPageNumbering for the toc section
    (presumably: does the toc section page numbering start at 1 -- confirm in that function).
    
    'textNumStart': key name handed to _checkAutomaticPageNumbering for the text section
    (presumably: does the text section page numbering start at 1 -- confirm in that function).
    
    'titlePg': initialised to None and not assigned by this function.
    
    @note: XML example:
    
    <w:pgNumType w:fmt="lowerRoman" w:start="1"/>
    
    <w:pgNumType w:start="1"/>
     
    @return: Findings in the errorIds-dict as key-boolean pairs as described above.
    '''
    
    docXml = document['word/document.xml']
    
    # Two dimensional list: one inner list of w:sectPr elements per section.
    allSectionProperties = getSectionElementsBySections(docXml)
    
    errorIds = {'frontPage': None, 'tocPageNumbering': None, 'differentPageNumbering': None, 
            'nameInToc': None, 'nameInText': None, 'pageNumbering': None, 'tocNumStart': None, 
            'textNumStart': None, 'titlePg': None} # @see XML requirement file
    
    # Header/footer currently in effect; carried over from section to section.
    currentHdrRef = None
    currentFtrRef = None

    #allSectionProperties[0] = cover page
    #allSectionProperties[1] = table of contents page 
    #allSectionProperties[2] = actual document section
    
    # Raises IndexError when the document has fewer than three sections.
    coverSection = allSectionProperties[0]
    tocSection = allSectionProperties[1]
    commonSection = allSectionProperties[2]
    
    # --- cover section: any header or footer at all is reported ---
    for coverSectPr in coverSection:
    
        frontPageHeaderReferences = _getElementsWithinElement(coverSectPr, 'w:headerReference')
        frontPageFooterReferences = _getElementsWithinElement(coverSectPr, 'w:footerReference')
    
        frontPageHdrRef = _checkFrontPageHeadersAndFooters(frontPageHeaderReferences, document)
        if frontPageHdrRef is not None:
            currentHdrRef = frontPageHdrRef
            
        frontPageFtrRef = _checkFrontPageHeadersAndFooters(frontPageFooterReferences, document)
        if frontPageFtrRef is not None:
            currentFtrRef = frontPageFtrRef
        
    if currentHdrRef is not None or currentFtrRef is not None:
        errorIds["frontPage"] = True
    else:
        errorIds["frontPage"] = False
        
    
    # --- toc section: page numbering ---
    tocSectionNumberingFormat = None
    for tocSectPr in tocSection:
    
        tocHdrRefs = _getElementsWithinElement(tocSectPr, 'w:headerReference')
        tocFtrRefs = _getElementsWithinElement(tocSectPr, 'w:footerReference')
    
        # Only replace the inherited reference when this sectPr brings its own.
        tocHdrRef = _checkFrontPageHeadersAndFooters(tocHdrRefs, document)
        if tocHdrRef is not None:
            currentHdrRef = tocHdrRef
        
        tocFtrRef = _checkFrontPageHeadersAndFooters(tocFtrRefs, document)
        if tocFtrRef is not None:
            currentFtrRef = tocFtrRef
        
        tocSectionNumberingFormat = _checkAutomaticPageNumbering(tocSectPr, currentHdrRef, currentFtrRef, document, errorIds, "tocNumStart")
        
        # Anything other than False is treated as "numbering found"; stop at the first hit.
        if tocSectionNumberingFormat is not False:
            #PAGE NUMBERING IN HEADER AND FOOTER:
            errorIds['tocPageNumbering'] = True
            break
        else:
            errorIds['tocPageNumbering'] = False
    
    # is document's writer's name in tocSection header or footer?
    if _checkNameInHeaderOrFooter(currentHdrRef, document) is False and \
    _checkNameInHeaderOrFooter(currentFtrRef, document) is False:
        errorIds['nameInToc'] = False
    else: errorIds['nameInToc'] = True


    # --- text section: page numbering ---
    commonSectionNumberingFormat = None
    for commonSectPr in commonSection:
        commonHdrRefs = _getElementsWithinElement(commonSectPr, 'w:headerReference')
        commonFtrRefs = _getElementsWithinElement(commonSectPr, 'w:footerReference')
    
        commonHdrRef = _checkFrontPageHeadersAndFooters(commonHdrRefs, document)
        if commonHdrRef is not None:
            currentHdrRef = commonHdrRef
        
        commonFtrRef = _checkFrontPageHeadersAndFooters(commonFtrRefs, document)
        if commonFtrRef is not None:
            currentFtrRef = commonFtrRef
        
        commonSectionNumberingFormat = _checkAutomaticPageNumbering(commonSectPr, currentHdrRef, currentFtrRef, document, errorIds, "textNumStart")
        
        if commonSectionNumberingFormat is not False:
            #PAGE NUMBERING IN HEADER AND FOOTER:
            errorIds['pageNumbering'] = True
            break
        else:
            errorIds['pageNumbering'] = False

    #  is the document's maker's name in the body part of the document
    if _checkNameInHeaderOrFooter(currentHdrRef, document) is True or \
    _checkNameInHeaderOrFooter(currentFtrRef, document) is True:
        errorIds['nameInText'] = True
    else: errorIds['nameInText'] = False

    # The formats are only compared when numbering was found in both sections.
    if tocSectionNumberingFormat is not False and commonSectionNumberingFormat is not False:
        if tocSectionNumberingFormat != commonSectionNumberingFormat:
            errorIds['differentPageNumbering'] = True
        else: 
            errorIds['differentPageNumbering'] = False
    else:
        errorIds['differentPageNumbering'] = False

    return errorIds
 832   
 833 -def getParagraphElementsBySections(docXml, sectionName): 
 834      '''Get paragraph elements of the wanted section. 
 835      The page breaking section break elements changes section, continuous section brake elements don't change section. 
 836   
 837      The first list of the section elements is the cover section. 
 838      The second list of the section elements is the table of contents-section. 
 839      The third list of the section elements is the text section. 
 840      The document has to have at least 3 sections. 
 841       
 842      @param docXml: The document.xml file as a DOM tree. 
 843      @param sectionName: The wanted section can be 'cover', 'toc' or 'text'. 
 844       
 845      @return: The list of the section elements. 
 846      ''' 
 847      
 848      sectionList = [[]] 
 849      #sectionList = [[w:p],[w:p,w:p],[w:p]] 
 850      bodyElement = docXml.getElementsByTagName('w:body')[0] # always exactly 1 element 
 851       
 852      i = 0 
 853       
 854      for textP in bodyElement.childNodes: 
 855          sectionList[i].append(textP) 
 856          sectPrs = textP.getElementsByTagName('w:sectPr') 
 857          if len(sectPrs) != 0: 
 858              typeElement = _getElementWithinElement(sectPrs[0], 'w:type') 
 859              if typeElement is not None: 
 860                  if typeElement.getAttribute('w:val') == 'continuous': 
 861                      continue 
 862              else: 
 863                  i += 1 
 864                  sectionList.append([]) 
 865       
 866      sectionElements = {'cover':sectionList[0], 'toc':sectionList[1], 'text':sectionList[2]} 
 867      if sectionElements.has_key(sectionName): 
 868          return sectionElements[sectionName] 
 869   
 870 -def getSectionElementsBySections(docXml, index = None): 
 871      '''Gets all the w:sectPr elements of a document or optionally the w:sectPr elements of a specific section. 
 872       
 873      w:sectPr elements are stored in a two dimensional list.  
 874      Continuous section breaks are appended to current outer list index. 
 875      The page breaking section raises the outer list index. 
 876       
 877      @param index: The index of the outer pageSections list that is get. None by default. 
 878       
 879      @return: The two dimensional list of all w:sectPr elements if index is None. Otherwise returns the list at the given index. 
 880      ''' 
 881      sectionElements = _getElementsWithinElement(docXml, "w:sectPr") 
 882       
 883      pageSections = [[]] 
 884       
 885      i = 0 
 886       
 887      for section in sectionElements: 
 888          typeElement = _getElementWithinElement(section, 'w:type') 
 889          if typeElement is not None: 
 890              if typeElement.getAttribute('w:val') == 'continuous': 
 891                  pageSections[i].append(section) 
 892          else: 
 893              pageSections[i].append(section) 
 894              pageSections.append([]) 
 895              i += 1 
 896       
 897      if len(pageSections[len(pageSections) - 1]) == 0: 
 898          pageSections.remove([]) 
 899       
 900      if index is None: 
 901          return pageSections 
 902      else: 
 903          return pageSections[index] 
 904   
 905 -def _areSectionsOverlapping(outerParagraphElements, innerParagraphElements, errorList, errorMsg, expectedResult): 
 906      '''Goes through two lists of paragraph elements checking if the same paragraph is in both lists. 
 907       
 908      @param outerParagraphElements: The outer paragraphlist to be searched for. 
 909      @param innerParagraphElements: The inner pagraphlist to be searched for. 
 910      @param errorList: The list for appending error messages. 
 911      @param errorMsg: The error message to be appended. 
 912      @param expectedResult: Boolean of the expected result. 
 913       
 914      @return: expectedResult changed or unchanged. 
 915      ''' 
 916      found = False 
 917      for coverElement in outerParagraphElements: 
 918          if found is True: 
 919              break 
 920          for element in innerParagraphElements: 
 921              if coverElement.isSameNode(element): # is the table of contents in the cover section, where it shouldn't be 
 922                  expectedResult = not expectedResult 
 923                  errorList.append(errorMsg) 
 924                  found = True 
 925                  break 
 926               
 927      return expectedResult 
 928   
 929 -def checkSections(document, errorList): 
 930      '''Goes through the section elements in the document checking that the sections are done properly. 
 931       
 932      There must be at least three sections in the document. 
 933      The cover page and the table of the contents cannot be in the same section. 
 934      Also checks that the Microsoft Office Word setting "Different first page" is off. 
 935       
 936      @return: True if everything went well, False if something went terribly wrong or 
 937                error list if an error was found and the checking could be completed. 
 938      ''' 
 939      docXml = document['word/document.xml'] 
 940      styleXml = document['word/styles.xml'] 
 941      cover = True 
 942      toc = False 
 943       
 944      allSectionProperties = getSectionElementsBySections(docXml) 
 945   
 946      if len(allSectionProperties) < 3: 
 947          return False 
 948   
 949      for section in allSectionProperties: 
 950          for sectPr in section: 
 951              if len(sectPr.getElementsByTagName('w:titlePg')) > 0: 
 952                  errorList.append('titlePg') 
 953                  return False 
 954       
 955      tocParagraphs = _getParagraphElementsBySequentialStyleName('toc', styleXml, docXml) 
 956      coverSectionParagraphs = getParagraphElementsBySections(docXml, 'cover') # cover page 
 957      tocSectionParagraphs = getParagraphElementsBySections(docXml, 'toc') 
 958   
 959      cover = _areSectionsOverlapping(coverSectionParagraphs, tocParagraphs, errorList, "cover", cover) 
 960   
 961      toc = _areSectionsOverlapping(tocSectionParagraphs, tocParagraphs, errorList, "toc", toc) 
 962   
 963      # TODO: error handling, errorIdsAndPositions 
 964      if toc is True and cover is True: 
 965          return True 
 966      else:  
 967          return errorList 
 968   
 969  #def getSectionProperties(document): 
 970  #    ''' Checks all the sectPr-elements. There must be at least three section breaks  (first page, toc-page and content) in the whole document.  
 971  #        The margins should be the same throughout the whole document? 
 972  #         
 973  #        @return: section properties in a dict''' 
 974  #     
 975  #    docXml = document['word/document.xml'] 
 976  #     
 977  #    finalSectionProperties = dict([['w:w', None], ['w:h', None], ['w:top', None], ['w:right', None], ['w:bottom', None], \ 
 978  #                               ['w:left', None], ['w:right', None], ['w:header', None], ['w:footer', None], \ 
 979  #                               ['w:gutter', None], ['w:start', None], ['w:space', None], ['w:linePitch', None], ['w:titlePg', None]]) 
 980  #     
 981  #    allSectionProperties = _getElements('w:sectPr', docXml) # returns a nodeList 
 982  #     
 983  #    for element in allSectionProperties: 
 984  #        finalSectionProperties = _getStyleDefinitions(element, finalSectionProperties) 
 985  #        #TODO: conversions from twips to cm 
 986  # 
 987  #         
 988  #    return finalSectionProperties 
 989   
 990   
 991 -def _checkPageProperties(allSectionProperties, pageProperties, tagName): 
 992      ''' Goes through all section properties to see that they have coherent property values. 
 993       
 994      If the property value is the same in all section elements, the value is stored in pageProperties. 
 995      If something is different between the sections, it's wrong and the page property is set False. 
 996      For example, if two different section elements have different page top marginal, the property is set False. 
 997       
 998      @param allSectionProperties: All w:sectPr elements of the document. 
 999      @param pageProperties: the allowed page properties are {'top': None, 'right': None, 'bottom': None, 'left': None} or {'w': None, 'h': None}. 
1000      @param tagName: Tag name of the element whose properties are checked. 
1001       
1002      @return: pageProperties dict with coherent page values and incoherent values set as False. 
1003      ''' 
1004   
1005      for element in allSectionProperties: 
1006          for key in pageProperties.keys(): 
1007              size = _getElementWithinElement(element, tagName).getAttribute('w:' + key) 
1008              if pageProperties[key] is None: 
1009                  pageProperties[key] = size 
1010              elif pageProperties[key] != size: 
1011                  pageProperties[key] = False # False means that the information has bee changed. 
1012       
1013      return pageProperties 
1014   
def _convertSizes(sizesDict):
    '''Converts every value of the dict with convertTwipToCm, in place.

    Each value is passed through int() and convertTwipToCm(), and the result
    is rounded to one decimal.

    @param sizesDict: Dict whose values can be converted with int().

    @return: The same dict object with the converted values.
    '''
    for name, rawSize in list(sizesDict.items()):
        # rounded to a precision of one decimal
        sizesDict[name] = round(convertTwipToCm(int(rawSize)), 1)
    return sizesDict
1023   
def getPageMarginals(document):
    '''Gets the document page marginals sizes.

    Only the top, right, bottom and left marginals are compared. Header and
    footer sizes are left out on purpose: they can differ between sections,
    for example when one section has no header or footer at all.

    @param document: The document parts; 'word/document.xml' is read.

    @return: False if the marginals are not coherent, otherwise a dictionary containing the marginal sizes.
    '''
    sectPrElements = _getElementsWithinElement(document['word/document.xml'], 'w:sectPr')

    marginals = dict.fromkeys(('top', 'right', 'bottom', 'left'))
    _checkPageProperties(sectPrElements, marginals, 'w:pgMar')

    # _checkPageProperties marks a value that differs between sections as False.
    if False in marginals.values():
        return False
    return _convertSizes(marginals)
1043   
def getPageSize(document):
    '''Gets the document page sizes.

    @param document: The document parts; 'word/document.xml' is read.

    @return: False if the page sizes are not coherent, otherwise a dictionary
             with the keys 'width' and 'height'.
    '''
    sectPrElements = _getElementsWithinElement(document['word/document.xml'], 'w:sectPr')

    rawSize = {'w': None, 'h': None} # width and height
    _checkPageProperties(sectPrElements, rawSize, 'w:pgSz')

    # _checkPageProperties marks a value that differs between sections as False.
    if False in (rawSize['w'], rawSize['h']):
        return False
    return _convertSizes({'width': rawSize['w'], 'height': rawSize['h']})
1062           
1063       
def _getTitle(coreXml):
    '''Reads the value of the dc:title element from core.xml; None if not found.'''
    title = _getElementValueWithinElement('dc:title', coreXml)
    return title
1067   
def _getCreator(coreXml):
    '''Reads the value of the dc:creator element from core.xml; None if not found.'''
    creator = _getElementValueWithinElement('dc:creator', coreXml)
    return creator
1072   
def _getLastModifier(coreXml):
    '''Reads the value of the cp:lastModifiedBy element from core.xml; None if not found.'''
    lastModifier = _getElementValueWithinElement('cp:lastModifiedBy', coreXml)
    return lastModifier
1076   
def _getCreateDate(coreXml):
    '''Reads the document creation date (dcterms:created) from core.xml; None if not found.'''
    created = _getElementValueWithinElement('dcterms:created', coreXml)
    return created
1080   
def _getLastModifiedDate(coreXml):
    '''Reads the last modified date (dcterms:modified) from core.xml; None if not found.'''
    modified = _getElementValueWithinElement('dcterms:modified', coreXml)
    return modified
1084       
def _getRevision(coreXml):
    '''Reads the document revision (cp:revision) from core.xml; None if not found.'''
    revision = _getElementValueWithinElement('cp:revision', coreXml)
    return revision
1088   
1089   
def _getTextFromParagraph(paragraph):
    '''Gets the text content of <w:t>-elements from the given (paragraph) element.

    Only the first child node of every <w:t> element is read. An empty
    <w:t/> element has no child node and contributes nothing.

    @param paragraph: The element whose <w:t> elements are read.

    @return: the text content as a string, '' if there is no text.
    '''
    # gets all text elements from the given paragraph
    textElements = _getElementsWithinElement(paragraph, 'w:t')
    if textElements is None:
        return ''

    textParts = []
    for textElement in textElements:
        textNode = textElement.firstChild
        # firstChild is None for <w:t/>; nodeValue is None for non-text nodes.
        if textNode is None or textNode.nodeValue is None:
            continue
        textParts.append(textNode.nodeValue)

    return ''.join(textParts)
1103       
def checkTocContent(document):
    '''Checks if all of the headings created in the document are listed in the table of contents.

    The non-empty heading texts are compared pairwise, in document order, with
    the texts of the table of contents paragraphs: the amount must be the same
    and every table of contents entry must contain the text of its heading.

    @param document: The document parts; 'word/document.xml' and 'word/styles.xml' are read.

    @return: True if toc matches the headings content, False otherwise.
    '''
    docXml = document['word/document.xml']
    styleXml = document['word/styles.xml']

    headingParagraphs = _getParagraphElementsBySequentialStyleName("heading", styleXml , docXml)
    docTocStyles = _getParagraphElementsBySequentialStyleName('toc', styleXml, docXml)

    # Empty heading paragraphs are not headings the reader can see.
    docHeadings = []
    for heading in headingParagraphs:
        headingText = _getTextFromParagraph(heading).strip()
        if headingText != "":
            docHeadings.append(headingText)

    tocHeadings = [_getTextFromParagraph(tocStyle).strip() for tocStyle in docTocStyles]

    # A different amount of entries means the table of contents is not up to date.
    if len(docHeadings) != len(tocHeadings):
        return False

    # The toc entry also holds the page number, so the heading only has to be a part of it.
    for tocHeading, docHeading in zip(tocHeadings, docHeadings):
        if tocHeading.find(docHeading) == -1:
            return False

    bookmarkStarts = _getElementsWithinElement(docXml, 'w:bookmarkStart') or []
    for element in docTocStyles:
        for instrElement in (_getElementsWithinElement(element, 'w:instrText') or []):
            if instrElement.firstChild is None:
                # Empty <w:instrText/> carries no field code.
                continue
            instrElementValue = instrElement.firstChild.nodeValue
            if instrElementValue.find('PAGEREF') == -1:
                # We only want to handle the tags with value including PAGEREF.
                continue
            # NOTE(review): this compares the whole field code against the
            # bookmark name and fails the check on a match; it looks like the
            # intent was to verify that the PAGEREF target exists. Behaviour
            # is kept as it was -- confirm the intent before changing it.
            for bookmark in bookmarkStarts:
                bookmarkNameValue = bookmark.getAttribute('w:name')
                if bookmarkNameValue.find(instrElementValue) != -1: # if the same code is in bookmarkStart
                    return False

    return True
1161       
def checkTOC(document):
    ''' Checks if the document has a table of contents.

    A table of contents is considered to exist when some w:pStyle element of
    document.xml refers to the style whose name is "toc 1" in styles.xml.

    TODO: a page break / section break before (and after) the table of
    contents is not checked here.

    @see: checkTocContent -- checks that the table of contents is up to date.

    @note: XML example of a table of contents paragraph:

    <w:p w:rsidR="002274FC" w:rsidRDefault="00FA6E61">

    <w:pPr>

    <w:pStyle w:val="Sisluet1"/>

    @return: True if a paragraph with the "toc 1" style is found, False otherwise.
    '''
    tocStyleId = _getStyleIdByName("toc 1", document['word/styles.xml']) # e.g. Sisluet1
    paragraphStyles = _getElementsWithinElement(document['word/document.xml'], 'w:pStyle')

    if paragraphStyles is None:
        return False

    return any(paragraphStyle.getAttribute('w:val') == tocStyleId
               for paragraphStyle in paragraphStyles)
1225           
def checkCoverPage(document):
    ''' Checks if the front page is done correctly.

    The text of the first section is collected and searched for
      'email': an '@' character,
      'name':  the last modifier's name from docProps/core.xml,
      'title': the document title from docProps/core.xml.
    A missing or empty name/title is never reported as found.

    @param document: The document parts; 'word/document.xml' and 'docProps/core.xml' are read.

    @return: coverPageText dictionary containing True or False values.
    '''
    # TODO: the structure of the document should be checked before this is run.

    coverPageText = { 'email': False,
                      'name': False,
                      'title': False }

    docXml = document['word/document.xml']
    coreXml = document['docProps/core.xml']

    paragraphs = _getElementsWithinElement(docXml, 'w:p')
    allSectionProperties = getSectionElementsBySections(docXml, 0)

    # The w:sectPr that ends the first section sits in w:p/w:pPr.
    lastParagraphOfFirstPage = allSectionProperties[-1].parentNode.parentNode

    # Saves the content of the first page, getting text from the beginning and
    # breaks the loop when the paragraph holding the sectPr-node appears.
    # NOTE: if the section breaks are wrong, the table of contents etc. can
    # leak into the "first page" text.
    textParts = []
    for element in paragraphs:
        textParts.append(getTextContent(element))
        if element.isSameNode(lastParagraphOfFirstPage):
            break
    firstPageText = ''.join(textParts)

    # the e-mail address will eventually come from the user interface
    if firstPageText.find('@') != -1:
        coverPageText['email'] = True

    # None would make find() raise TypeError and '' would match any text.
    lastModifier = _getLastModifier(coreXml)
    if lastModifier and firstPageText.find(lastModifier) != -1:
        coverPageText['name'] = True

    title = _getTitle(coreXml)
    if title and firstPageText.find(title) != -1:
        coverPageText['title'] = True

    return coverPageText
1267       
def getRelsTargetByRId(rId, rels):
    '''Looks up the Target attribute of the first Relationship element whose Id attribute equals rId.
    The target can be for example a relative path to local XML files or images. It can also be a hyperlink.

    @param rId: Id attribute value of a Relationship element.
    @param rels: rels file as a DOM tree.

    @return: The value of Target attribute if found, otherwise None.
    '''
    matchingTargets = (candidate.getAttribute('Target')
                       for candidate in rels.getElementsByTagName('Relationship')
                       if candidate.getAttribute('Id') == rId)
    return next(matchingTargets, None)
1280       
def getParentParagraph(element, tag='w:p'):
    ''' Walks up from the given element to its nearest ancestor with the given tag name.

    The walk ends without a result as soon as an ancestor has no tagName
    attribute or there are no more ancestors.

    @param element: The element whose parent <w:p> element is searched for.
    @param tag: The parent tagname, defaults to 'w:p'.

    @return: The parent element, or None if no parent is found.
    '''
    ancestor = element.parentNode

    while ancestor is not None:
        try:
            if ancestor.tagName != tag:
                ancestor = ancestor.parentNode
                continue
        except AttributeError:
            return None
        return ancestor
    return None
1301   
def checkImages(document):
    ''' Check if there is an image in the document.

    Both pic:pic and w:pict elements of document.xml count as images.

    @param document: The document parts; 'word/document.xml' is read.

    @return: True if even one image is found, False otherwise.
    '''
    #TODO: what is the difference between pic:pic and w:pict?
    #w:pict is used when pasting a chart from powerpoint or excel?
    for pictureTag in ('pic:pic', 'w:pict'):
        if len(document['word/document.xml'].getElementsByTagName(pictureTag)) > 0:
            return True
    return False
1318       
def getImagePaths(document):
    '''Gets the image paths or the file names of the images used in the document.

    DrawingML pictures (<pic:pic>) reference their image through the r:embed
    (or, for linked images, r:link) attribute of their <a:blip> element.
    VML pictures (<w:pict>) have no <a:blip>; they reference the image through
    the r:id attribute of a <v:imagedata> element. Pictures that contain
    neither element are skipped instead of raising an IndexError.

    @param document: The document parts as DOM trees, keyed by their path in the package.

    @return: The image targets as strings in a list, <pic:pic> images first.
    An entry is None if the relationship id is not found in the rels file.
    '''
    docXml = document['word/document.xml']
    relationshipIds = []

    for picElement in docXml.getElementsByTagName('pic:pic'):
        # Only the first blip of a picture is considered.
        for blip in picElement.getElementsByTagName('a:blip')[:1]:
            relationshipIds.append(blip.getAttribute('r:embed') or blip.getAttribute('r:link'))

    for pictElement in docXml.getElementsByTagName('w:pict'):
        for imageData in pictElement.getElementsByTagName('v:imagedata')[:1]:
            relationshipIds.append(imageData.getAttribute('r:id'))

    if not relationshipIds:
        # The rels file is needed only when there is something to resolve.
        return []

    rels = document['word/_rels/document.xml.rels']
    return [getRelsTargetByRId(rId, rels) for rId in relationshipIds]
1333       
def checkImageCaptions(document):
    '''Checks if the next paragraph after a picture paragraph uses the caption style.

    Also checks that the caption contains an automatic field.
    Goes through all picture paragraphs (<pic:pic> and <w:pict> elements).

    @param document: The document parts as DOM trees, keyed by their path in the package.

    @return: True if all images have captions (also when there are no images at all),
    False otherwise.
    '''
    # Example of a caption paragraph:
    #
    #    <w:p w:rsidR="0011423F" w:rsidRDefault="00FE23CD" w:rsidP="00A72640">
    #    <w:pPr>
    #        <w:pStyle w:val="Kuvanotsikko"/>
    #    </w:pPr>
    #    <w:bookmarkStart w:id="10" w:name="_Ref247712443"/>
    #    <w:r>
    #        <w:t xml:space="preserve">Kuva </w:t>
    #    </w:r>
    #    <w:fldSimple w:instr=" SEQ Kuva \* ARABIC ">
    #        <w:r>
    #            <w:t>1</w:t>
    #        </w:r>
    #    </w:fldSimple>
    #    −
    #    <w:r w:rsidR="00876DBA">
    #        <w:t>Kurssijako</w:t>
    #    </w:r>
    #    <w:bookmarkEnd w:id="10"/>
    #    </w:p>
    styleXml = document['word/styles.xml']
    docXml = document['word/document.xml']

    picElements = list(docXml.getElementsByTagName('pic:pic'))
    picElements += list(docXml.getElementsByTagName('w:pict'))

    for pic in picElements:
        picParagraph = getParentParagraph(pic)
        try:
            # Any missing piece (no parent paragraph, no following paragraph,
            # no paragraph properties or no style) means there is no caption.
            captionParagraph = picParagraph.nextSibling
            captionParagraphPpr = captionParagraph.getElementsByTagName('w:pPr')[0]
            captionParagraphStyleID = captionParagraphPpr.getElementsByTagName('w:pStyle')[0].getAttribute('w:val')
            captionParagraphStyleName = _getStyleNameById(captionParagraphStyleID, styleXml)
            if captionParagraphStyleName != "caption":
                return False
        except Exception:
            return False

        #Find 'SEQ' in either one of the attribute values in the paragraph or as a text node. It indicates an automatic field.
        if getAttributeContent(captionParagraph).find('SEQ') == -1 and \
           getTextContent(captionParagraph).find('SEQ') == -1:
            return False

    return True
1391   
def checkStyleUsage(document, errorIdsAndPositions):
    '''Checks that text paragraphs are using styles and that no manual style definitions are made.

    Goes through all paragraph-elements in a document looking for <w:pStyle>-elements.
    Gets the style definitions to see if there are manual changes.

    @note: Exception:

    Automatically generated table on contents can contain "manual" style definitions.
    The <w:sectPr> elements within paragraph elements are skipped also.

    @param document: The document parts as DOM trees, keyed by their path in the package.
    @param errorIdsAndPositions: A dict for error strings. Should contain keys 'manualChanges' and 'styleNotUsed',
    each mapped to a list. The beginning of the text of every offending paragraph is appended to the matching list.

    @return: True if nothing was found, False if even one error was found
    (that is, if any list in errorIdsAndPositions is non-empty).
    '''

    paragraphs = document['word/document.xml'].getElementsByTagName('w:p')
    styleXml = document['word/styles.xml']

    for p in paragraphs:
        # A fresh dict for every paragraph: a value other than None after the
        # collection below means the definition was made manually in the paragraph.
        styleDefinitions = {'w:autoRedefine': None,
                            'w:left': None,
                            'w:right': None,
                            'w:firstLine': None,
                            'w:line': None,
                            'w:before': None,
                            'w:after': None,
                            'w:widowControl': None,
                            'w:jc': None,
                            'w:sz': None,
                            'w:ascii': None,
                            'w:asciiTheme': None,
                            'w:b': None,
                            'w:i': None,
                            'w:u': None,
                            #'w:outline': None,
                            #'w:numId': None,
                            #'w:ilvl': None,
                            #'w:lang': None,
                            'w:keepNext': None,
                            'w:keepLines': None,
                            'w:pageBreakBefore': None}

        try:
            style = p.getElementsByTagName('w:pStyle')[0].getAttribute('w:val')
            style = _getStyleNameById(style, styleXml)

            # It seems that Word (2010) makes style definitions in document.xml
            # when generating an automatic table of contents:
            if style.startswith('toc'):
                continue

            # TODO: check the empty paragraphs content to prevent false positives,
            # are they empty text paragraphs or maybe a picture paragraph etc?
            # Now just leaves the paragraphs with no text be and doesn't give an error.

        except Exception:
            # No style element (or no resolvable style name) for this paragraph.
            pContent = _getTextFromParagraph(p)
            if pContent.strip() != "":
                errorIdsAndPositions['styleNotUsed'].append(pContent[:30])

        try:
            for paragraphProperties in p.getElementsByTagName('w:pPr'):
                for propertyElement in paragraphProperties.childNodes:
                    # Text and comment nodes have no tagName; skip them instead of
                    # letting them abort the check for the whole paragraph.
                    if propertyElement.nodeType != propertyElement.ELEMENT_NODE:
                        continue
                    # We don't want section properties to be mixed up as manually made style definitions.
                    if propertyElement.tagName != "w:sectPr":
                        styleDefinitions = _getStyleDefinitions(propertyElement, styleDefinitions)
            for runProperties in p.getElementsByTagName('w:rPr'):
                styleDefinitions = _getStyleDefinitions(runProperties, styleDefinitions)

            # One manual definition is enough to report the paragraph (once).
            if any(value is not None for value in styleDefinitions.values()):
                pContent = _getTextFromParagraph(p)
                if pContent.strip() != "":
                    errorIdsAndPositions['manualChanges'].append(pContent[:50])
        except Exception:
            # Best effort: a paragraph that cannot be analysed is left unreported.
            continue

    for key in errorIdsAndPositions.keys():
        if len(errorIdsAndPositions[key]) > 0:
            return False
    return True
1477           
def checkEndnotesAndFootnotes(document):
    '''Tells whether the document contains an endnote or a footnote.

    Counts the w:endnoteReference and w:footnoteReference elements of word/document.xml.

    @param document: The document parts as DOM trees, keyed by their path in the package.

    @return: True if an endnote or a footnote is found, False otherwise.
    '''
    docXml = document["word/document.xml"]

    endnoteCount = len(docXml.getElementsByTagName('w:endnoteReference'))
    footnoteCount = len(docXml.getElementsByTagName('w:footnoteReference'))

    # TODO: doublecheck: find the endnote with the id in endnotes.xml. That's where the endnote text is located.
    # TODO: doublecheck: find the footnote in footnotes.xml

    return endnoteCount != 0 or footnoteCount != 0
1497           
def checkCrossRefrenceToImageCaption(document):
    '''Goes through images' captions looking for a reference. Then checks if the caption is referenced somewhere.

    The paragraph following each picture paragraph is expected to hold a
    <w:bookmarkStart> element. The bookmark name is then searched for in the
    text of the <w:instrText> field elements of the document.

    @param document: The document parts as DOM trees, keyed by their path in the package.

    @return: True as soon as a cross reference to some caption is found.
    False if a picture has no caption bookmark, if no caption is referenced
    or if there are no pictures.
    '''
    #TODO: not implemented in word_processing 13.5.2011

    docXml = document['word/document.xml']

    picElements = list(docXml.getElementsByTagName('pic:pic'))
    picElements += list(docXml.getElementsByTagName('w:pict'))
    if not picElements:
        return False

    # The field texts do not depend on the picture, so collect them only once.
    fieldTexts = [getTextContent(element) for element in docXml.getElementsByTagName('w:instrText')]

    for pic in picElements:
        try:
            # A picture outside a paragraph, a missing next paragraph and a
            # missing bookmark all mean that the caption reference is not found.
            captionParagraph = getParentParagraph(pic).nextSibling
            bookmarkStartElement = captionParagraph.getElementsByTagName('w:bookmarkStart')[0]
        except (AttributeError, IndexError):
            return False

        reference = bookmarkStartElement.getAttribute('w:name')
        if reference == "":
            # An empty name would match every field text; treat it as a missing reference.
            return False

        for elementText in fieldTexts:
            if elementText.find(reference) != -1:
                return True

    # No caption bookmark is referenced anywhere.
    return False
1529   
def _getElementByAttributeValue(nodeList, attributeName, attributeValue):
    '''Finds the first element of a list that has the wanted value in the wanted attribute.

    @param nodeList: A list of elements to be searched for.
    @param attributeName: The name of the wanted attribute.
    @param attributeValue: The wanted value of the attribute.

    @return: The first element whose attribute has the wanted value, None if there is no such element.
    '''
    found = None
    for candidate in nodeList:
        if candidate.getAttribute(attributeName) == attributeValue:
            found = candidate
            break
    return found
1543   
def _isStyleUsed(document, styleName):
    '''Tells whether at least one paragraph of the document uses the style with the given name.

    The style name is first mapped to its style id with the help of styles.xml,
    then the paragraphs of document.xml using that id are collected.

    @param document: The document parts as DOM trees, keyed by their path in the package.
    @param styleName: The name of the style looked for.

    @return: True if the style is used, False otherwise.
    '''
    styleId = _getStyleIdByName(styleName, document['word/styles.xml'])
    paragraphsUsingStyle = _getParagraphElementsByStyleId(document['word/document.xml'], styleId)
    return len(paragraphsUsingStyle) > 0
1560   
1561  #def checkHeadingUsage(document): 
1562  #    '''Check if heading styles are used in the document. 
1563  #     
1564  #    @return: True if heading styles are used, False otherwise. 
1565  #    ''' 
1566  #    docXml = document['word/document.xml'] 
1567  #    styleXml = document['word/styles.xml'] 
1568  #     
1569  #    headingParagraphs = _getParagraphElementsBySequentialStyleName("heading", styleXml, docXml) 
1570  #    if len(headingParagraphs) == 0: 
1571  #        #errors.append("No heading styles used in this document!") 
1572  #        return False 
1573  #     
1574  #    usedHeadingStyles = [] 
1575  #    for heading in headingParagraphs: 
1576  #        styleId = heading.getElementsByTagName('w:pStyle')[0].getAttribute('w:val') 
1577  #        if usedHeadingStyles.count(styleId) == 0: 
1578  #            usedHeadingStyles.append(str(styleId)) 
1579  #     
1580  #    #FIXME: not the most dynamic way: 
1581  #    if len(usedHeadingStyles) < 2: 
1582  #        return False 
1583  #     
1584  #    return True 
1585   
def checkHeadingNumbering(document, errorIdsAndPositions):
    '''Checks the headings in the document.

    Goes through the heading styles used in the document checking that they use a multilevel numbering,
    the numbering is done correctly using styles and that the numbering is connected to other heading styles.

    Gets all the heading styles used in the document.
    Searches for the heading's numbering definition reference in styles.xml.
    Next searches the associated numbering definition in numbering.xml.
    Next searches the correct numbering level definition associated to the heading.
    Checks that the numbering is multilevel and done correctly using the heading styles.

    The heading level is read from the last character of the style id (for example 'Heading2' -> 2).

    @note: XML example:

    styles.xml:

    <w:style w:type="paragraph" w:styleId="Heading2"> - Heading 2 style definition

    <w:name w:val="heading 2"/>

    <w:pPr>

    <w:numPr>

    <w:ilvl w:val="1"/> - Numbering Level Reference

    <w:numId w:val="1"/> - Numbering Definition Instance Reference

    </w:numPr>

    <w:outlineLvl w:val="1"/>

    </w:pPr>

    </w:style>

    numbering.xml:

    <w:abstractNum w:abstractNumId="0"> - Abstract Numbering Definition

    <w:multiLevelType w:val="multilevel"/> - Abstract Numbering Definition Type

    <w:lvl w:ilvl="0"> - </w:lvl> - Numbering Level Definition

    <w:lvl w:ilvl="1"> - Numbering Level Definition

    <w:start w:val="1"/> - Starting Value

    <w:numFmt w:val="decimal"/> - Numbering Format

    <w:pStyle w:val="Heading2"/> - Paragraph Style's Associated Numbering Level

    <w:lvlText w:val="%1.%2"/> - Numbering Level Text

    <w:lvlJc w:val="left"/> - Justification

    <w:pPr> - Numbering Level Associated Paragraph Properties

    <w:ind w:left="576" w:hanging="576"/>

    </w:pPr>

    </w:lvl>

    </w:abstractNum>

    <w:num w:numId="1"> - Numbering Definition Instance

    <w:abstractNumId w:val="0"/> - Abstract Numbering Definition Reference

    </w:num>

    @param document: The document parts as DOM trees, keyed by their path in the package.
    @param errorIdsAndPositions: A dict for the errors. The value of a key is set to the
    text of the offending heading or to the id of the offending heading style.
    Should contain the following keys:
      - 'manualNumbering' -- numbering is done manually somehow.
      - 'styleNotUsed' -- an expected heading style is not used.
      - 'differentNumbering' -- some heading style is using different numbering than some other heading styles.
      - 'notMultilevel' -- the numbering is not multilevel.
      - 'outlineLvl' -- the outline of a heading style is not correct.
      - 'numStart' -- the numbering doesn't start at 1.
      - 'numWrong' -- the numbering is somehow not done with styles.
      - 'numFormat' -- the numbering format is not correct.
      - 'notSequential' -- heading styles are not used correctly in a row for example heading 3 is used after heading 1.

    @return: False if the document has no numbering.xml (no heading numbering used at all),
    True otherwise. The errors found are reported only through errorIdsAndPositions.
    '''
    docXml = document["word/document.xml"]
    styleXml = document["word/styles.xml"]

    try:
        numXml = document['word/numbering.xml']
    except Exception:
        # No numbering part: no heading numbering used at all.
        return False

    headingParagraphs = _getParagraphElementsBySequentialStyleName("heading", styleXml, docXml)

    usedHeadingsStyleIds = []
    previousHeadingLevel = 0
    # NOTE(review): headingParagraphs looks to be grouped by heading style
    # (all 'heading 1' paragraphs first, then 'heading 2', ...) rather than
    # being in document order, so 'notSequential' compares the levels in that
    # order -- confirm that this is the intent.
    for heading in headingParagraphs:
        styleId = heading.getElementsByTagName('w:pStyle')[0].getAttribute('w:val')
        # Numbering elements inside the paragraph itself override the style's numbering.
        if len(heading.getElementsByTagName('w:ilvl')) > 0 or \
           len(heading.getElementsByTagName('w:numId')) > 0:
            errorIdsAndPositions['manualNumbering'] = getTextContent(heading)
        if usedHeadingsStyleIds.count(styleId) == 0:
            usedHeadingsStyleIds.append(str(styleId))
        headingLevel = int(styleId[-1])
        if abs(headingLevel - previousHeadingLevel) > 1:
            errorIdsAndPositions['notSequential'] = getTextContent(heading)
        previousHeadingLevel = headingLevel

    #Sort the list: ['Heading1', 'Heading2', 'Heading3', ...]
    usedHeadingsStyleIds.sort()

    previousNumId = None

    for headingStyleId in usedHeadingsStyleIds:

        # The level is the last character of this very style id; the ids do not
        # have to be of the same length as the first one in the list.
        headingLevel = int(headingStyleId[-1])

        headingStyleElement = _getStyleElementById(headingStyleId, styleXml)

        # Get the numbering definitions of the heading style.
        # Default ilvl value to 0 -> ilvl-element not found (level is 0).
        styleDefinitions = {'w:ilvl': '0', 'w:numId': None, 'w:outlineLvl': None}
        styleDefinitions = _getStyleDefinitions(headingStyleElement, styleDefinitions)

        # Check that the numbering style definitions are OK.
        if styleDefinitions['w:numId'] is None:
            errorIdsAndPositions['styleNotUsed'] = headingStyleId
        if previousNumId is not None and styleDefinitions['w:numId'] != previousNumId:
            errorIdsAndPositions['differentNumbering'] = headingStyleId
        previousNumId = styleDefinitions['w:numId']
        if int(styleDefinitions['w:ilvl']) != headingLevel - 1:
            errorIdsAndPositions['notMultilevel'] = headingStyleId
        # A style without an outline level cannot have the correct one.
        if styleDefinitions['w:outlineLvl'] is None or \
           int(styleDefinitions['w:outlineLvl']) != headingLevel - 1:
            errorIdsAndPositions['outlineLvl'] = headingStyleId

        # Find the numbering definition element associated to the heading style.
        # Get the abstract numbering definition id from the numbering definition element.
        # Find the abstract numbering definition element with the correct id.
        # Find the numbering level definition with the same level that the heading style.
        try:
            numElement = _getElementByAttributeValue(numXml.getElementsByTagName('w:num'), 'w:numId', styleDefinitions['w:numId'])
            abstractNumId = numElement.getElementsByTagName('w:abstractNumId')[0].getAttribute('w:val')
            absNumElement = _getElementByAttributeValue(numXml.getElementsByTagName('w:abstractNum'), 'w:abstractNumId', abstractNumId)
            lvlElement = _getElementByAttributeValue(absNumElement.getElementsByTagName('w:lvl'), 'w:ilvl', styleDefinitions['w:ilvl'])
        except Exception:
            # Numbering level definitions not found for this heading style.
            continue
        if lvlElement is None:
            # The abstract numbering has no definition for this level.
            continue

        # Get the numbering level definitions.
        numDefinitions = {'w:start': None, 'w:numFmt': None, 'w:pStyle': None, 'w:lvlText': None, 'w:lvlJc': None, 'w:tentative': None}
        numDefinitions = _getStyleDefinitions(lvlElement, numDefinitions)

        # TODO: should we check that the numbering is in format 1, 1.1, 1.1.1 etc ?
        if numDefinitions['w:start'] != '1':
            # The numbering doesn't start at number 1.
            errorIdsAndPositions['numStart'] = headingStyleId
        if numDefinitions['w:pStyle'] != headingStyleId:
            # The numbering is not done correctly using heading styles.
            errorIdsAndPositions['numWrong'] = headingStyleId
        if numDefinitions['w:numFmt'] != "decimal":
            # The numbering format is not a decimal number.
            errorIdsAndPositions['numFormat'] = headingStyleId

    return True
1768       
def _getParagraphElementsByStyleId(docXml, styleId):
    '''Collects the paragraph elements of the document whose first <w:pStyle> element refers to the given style id.

    @param docXml: document.xml as a DOM tree.
    @param styleId: The style id looked for.

    @return: The matching <w:p> elements in a list; paragraphs without a style are left out.
    '''
    matchingParagraphs = []

    for paragraph in docXml.getElementsByTagName('w:p'):
        styleElements = paragraph.getElementsByTagName('w:pStyle')
        if len(styleElements) == 0:
            # No style used in this paragraph.
            continue
        if styleElements[0].getAttribute('w:val') == styleId:
            matchingParagraphs.append(paragraph)
    return matchingParagraphs
1781       
def _getParagraphElementsBySequentialStyleName(styleNamePrefix, styleXml, docXml):
    '''Collects the paragraph elements that use one of a family of sequentially numbered styles.

    The style names are formed as prefix + ' ' + number, for example heading 1, heading 2, etc or
    index 1, index 2, etc. The numbers are tried from 1 upwards until a style name is
    met that has no style id in styles.xml.

    @param styleNamePrefix: The prefix of the sequential style name.
    @param styleXml: styles.xml as a DOM tree.
    @param docXml: document.xml as a DOM tree.

    @return: The paragraph elements in a list, grouped by style in ascending style number.
    '''
    namePrefix = styleNamePrefix.strip() + " "
    collected = []

    styleNumber = 1
    styleId = _getStyleIdByName(namePrefix + str(styleNumber), styleXml)
    while styleId is not None:
        collected += _getParagraphElementsByStyleId(docXml, styleId)
        styleNumber += 1
        styleId = _getStyleIdByName(namePrefix + str(styleNumber), styleXml)
    return collected
1802       
def checkIndex(document):
    '''Checks that the document has an automatically made index.

    The index entries are the paragraphs using the styles index 1, index 2, etc.
    An automatically made index has an INDEX field declaration (<w:instrText>)
    in the paragraph before the first index entry, or in the one before that.

    @param document: The document parts as DOM trees, keyed by their path in the package.

    @return: False if an index is missing, '2' if index is not automatically made and True if everything was OK.
    '''
    docXml = document['word/document.xml']
    styleXml = document['word/styles.xml']

    indexParagraphs = _getParagraphElementsBySequentialStyleName("index ", styleXml, docXml)
    if len(indexParagraphs) == 0:
        return False

    # The previous w:p element of the first index entry should be something like this:
    #<w:p w:rsidR="002F2A09" w:rsidRDefault="00CA51D5">
        #<w:pPr>
            #<w:sectPr w:rsidR="002F2A09" w:rsidSect="002F2A09"> --- </w:sectPr>
        #</w:pPr>
        #<w:r>
            #<w:fldChar w:fldCharType="begin"/>
        #</w:r>
        #<w:r>
            #<w:instrText xml:space="preserve"> INDEX \c "2" \z "1035" </w:instrText>
        #</w:r>
        #<w:r>
            #<w:fldChar w:fldCharType="separate"/>
        #</w:r>
    #</w:p>

    # Search the 'instrText' field element, first in the previous sibling of the
    # first index entry and then in the sibling before that: there can be a
    # section break between the first index entry and the field declaration.
    indexFieldCodeElement = None
    candidate = indexParagraphs[0]
    for _attempt in range(2):
        try:
            candidate = candidate.previousSibling
            indexFieldCodeElement = candidate.getElementsByTagName('w:instrText')[0]
            break
        except (AttributeError, IndexError):
            # No such sibling, the sibling is not an element or it has no field code.
            indexFieldCodeElement = None

    if indexFieldCodeElement is None:
        # Index is not a field - the index is made manually, not automatically.
        return '2'
    if getTextContent(indexFieldCodeElement).find('INDEX') == -1:
        # INDEX-text not found in the field declaration.
        return '2'

    return True
1852           
def checkIndexContent(document):
    r''' Checks that the index of the document is not empty, and that the index entries are referenced somewhere in the document.

    Finds the paragraphs of document.xml that use the index styles (index 1, index 2, etc.).
    Collects the text content of those paragraphs and checks it isn't empty.
    Collects the text of all field code elements (<w:instrText>), splits it into
    index reference candidates and matches the candidates to the index content.

    @note: XML example:

    Index example:

    <w:p w:rsidR="002F2A09" w:rsidRDefault="00CA51D5">

    <w:r>

    <w:fldChar w:fldCharType="begin"/>

    </w:r>

    <w:r>

    <w:instrText xml:space="preserve"> INDEX \c "2" \z "1035" </w:instrText>

    </w:r>

    <w:r>

    <w:fldChar w:fldCharType="separate"/>

    </w:r>

    </w:p>

    <w:p w:rsidR="002F2A09" w:rsidRDefault="002F2A09">

    <w:pPr>

    <w:pStyle w:val="Index1"/>

    <w:tabs>

    <w:tab w:val="right" w:leader="dot" w:pos="3950"/>

    </w:tabs>

    </w:pPr>

    <w:r>

    <w:t>Index entry level 1</w:t>

    </w:r>

    </w:p>

    Reference example:

    <w:r w:rsidR="00B27B47">

    <w:instrText xml:space="preserve"> XE "</w:instrText>

    </w:r>

    <w:r w:rsidR="00B27B47" w:rsidRPr="00B27B47">

    <w:instrText>Level 1 entry</w:instrText>

    </w:r>

    <w:r w:rsidR="00B27B47" w:rsidRPr="00B27B47">

    <w:instrText>:</w:instrText>

    </w:r>

    <w:r w:rsidR="00B27B47" w:rsidRPr="0011587C">

    <w:instrText>Level 2 entry</w:instrText>

    </w:r>

    @param document: The document parts as DOM trees, keyed by their path in the package.

    @return: '3' if the index is empty, '4' if the content does not match with the document and True if everything went OK.
    '''
    docXml = document['word/document.xml']
    styleXml = document['word/styles.xml']

    # Index entry text -> None (no reference found yet) or True (referenced).
    referencedByEntry = {}
    for entryParagraph in _getParagraphElementsBySequentialStyleName("index ", styleXml, docXml):
        entryText = getTextContent(entryParagraph)
        if entryText is None or entryText == "":
            continue
        referencedByEntry[entryText] = None
    if len(referencedByEntry) == 0:
        # Index is empty.
        return '3'

    allFieldText = "".join([getTextContent(fieldElement)
                            for fieldElement in docXml.getElementsByTagName('w:instrText')])

    # Index entry reference example: 'XE "MainEntry"', 'XE "MainEntry:SubEntry"' or even 'XE "MainEntry:Heading" "Subentry:Heading"'
    # Check that the entries are actually included in the index:
    # Parse the string containing all text content of the w:instrText-elements.
    # First split at XE_ (where _ is whitespace), next split at \" and finally split at \:.
    # Compare the final index entry candidate to the index entries and vice versa to see if they match.
    # If finally some index entry doesn't have any matches, the entry is probably made manually. It's referenced nowhere!
    # TODO: check that the reference makes actually sense? Page number to the index comes from the page where the reference is.
    referenceComponents = [component
                           for field in allFieldText.split('XE ')
                           for quoted in field.split('\"')
                           for component in quoted.split(":")
                           if component.strip() != ""]

    for component in referenceComponents:
        # Every component marks at most one index entry: the first one that
        # contains the component or is contained in it.
        for entryText in referencedByEntry.keys():
            if component in entryText or entryText in component:
                referencedByEntry[entryText] = True
                break

    if any(state is None for state in referencedByEntry.values()):
        # No references found for some index entry.
        return '4'
    return True
1986       
def checkDoubleWhitespaces(document):
    '''Counts the double whitespaces in the text content of the document.

    @param document: The document parts as DOM trees, keyed by their path in the package.

    @return: The amount of occurrences of the double whitespaces found in the document, False if there are none.
    '''
    docXml = document['word/document.xml']
    return checkStringFromDocument(docXml, '  ')
1993   
def checkAsterisk(document):
    '''Counts the *-characters in the text content of the document.

    @param document: The document parts as DOM trees, keyed by their path in the package.

    @return: The amount of occurrences of the asterisks found in the document, False if there are none.
    '''
    docXml = document['word/document.xml']
    return checkStringFromDocument(docXml, '*')
2000       
def checkStringFromDocument(docXml, string):
    '''Counts the occurrences of a string in the text content of the document (in the w:t-elements).

    The text of the w:t-elements is joined paragraph by paragraph before counting,
    so an occurrence can span several w:t-elements of the same paragraph.

    @param docXml: document.xml as a DOM tree.
    @param string: The string looked for.

    @return: The amount of occurrences of the string found in the document, False if there are none.
    '''
    total = 0
    for paragraph in docXml.getElementsByTagName('w:p'):
        textPieces = [getTextContent(textElement)
                      for textElement in paragraph.getElementsByTagName('w:t')]
        total += "".join(textPieces).count(string)

    if total > 0:
        return total
    return False
2021  #    for p in docXml.getElementsByTagName('w:p'): 
2022  #        if(checkStringFromContent(p, "  ")): 
2023  #            errors.append("Double whitespace in paragraph: " + getTextContent(p)) 
2024   
def checkTabs(document):
    '''Checks if the tabulator is used in the document.

    Every w:tab-element of word/document.xml is examined together with the
    paragraph (w:p) that encloses it.

    @note: Exceptions (these tabulators are not counted):

      - automatically generated table of contents and index contain tabulators
        (the paragraph style name starts with 'toc' or 'index').

      - before an automatically generated index there is a paragraph-element with <instrText>-element
        and a <tab>-element (the paragraph text contains 'INDEX').

      - tabulators whose enclosing paragraph cannot be resolved, or whose paragraph has no w:pStyle-element.

    @param document: Mapping that holds the parsed parts under the keys 'word/document.xml' and 'word/styles.xml'.

    @return: The amount of the tabulator occurrences found in the document, False if none was found.
    '''
    #TODO: More exceptions?
    styleXml = document['word/styles.xml']
    tabElements = document['word/document.xml'].getElementsByTagName('w:tab')
    tabCount = 0

    for tab in tabElements:
        tabParent = getParentParagraph(tab, 'w:p')
        if tabParent is None:
            # No enclosing paragraph was resolved; nothing to inspect.
            continue

        styleElements = tabParent.getElementsByTagName('w:pStyle')
        if len(styleElements) == 0:
            # NOTE(review): paragraphs without an explicit style are skipped, exactly as
            # before -- confirm that tabs in default-styled paragraphs should not be counted.
            continue

        # Resolve the style name once; it may be missing from styles.xml.
        styleName = _getStyleNameById(styleElements[0].getAttribute('w:val'), styleXml)
        if styleName is not None and \
           (styleName.startswith('toc') or styleName.startswith('index')):
            continue

        # The field code paragraph in front of a generated index carries a tab as well.
        if getTextContent(tabParent).find('INDEX') != -1:
            continue

        tabCount += 1

    if tabCount == 0:
        return False

    return tabCount
2074   
def isParagraphEmpty(p, styleXml):
    '''Checks if a paragraph is empty.

    @note: Exceptions (paragraphs without text that are NOT reported as empty):

      - Picture in the document produces an empty paragraph.
      - Empty table cell produces an empty paragraph.
      - A table produces an empty paragraph right after the table.
      - Objects and graphics produce an empty paragraph.
      - A paragraph holding section properties (w:sectPr).
      - A paragraph whose preceding sibling uses a style with 'toc' or 'index' in its name,
        or whose second preceding sibling uses a style with 'index' in its name.

    @param p: The paragraph element under inspection.

    @param styleXml: The styles.xml DOM that is handed to _getStyleNameById.

    @return: False if the paragraph is not empty, True if it is empty.
    '''
    #FIXME: these are surely not the only exceptions. Add more exceptions.
    if len(_getTextFromParagraph(p).strip()) != 0:
        return False

    # Elements that legitimately live in a paragraph without any text.
    for tagName in ('pic:pic', 'w:sectPr', 'w:pict', 'w:object', 'a:graphic'):
        if len(p.getElementsByTagName(tagName)) > 0:
            return False

    if getParentParagraph(p, 'w:tbl') is not None:
        return False

    previous = p.previousSibling
    if previous is None:
        return True

    # getattr: a sibling that is not an element (e.g. a text node) has no tagName.
    if getattr(previous, 'tagName', None) == 'w:tbl':
        return False
    if _nodeStyleNameContains(previous, styleXml, ('toc', 'index')):
        return False
    if _nodeStyleNameContains(previous.previousSibling, styleXml, ('index',)):
        return False

    return True


def _nodeStyleNameContains(node, styleXml, fragments):
    '''Tells whether the style name of an element node contains any of the given fragments.

    @return: True on a match; False if the node is missing, is not an element,
             has no resolvable style name, or nothing matches.
    '''
    if node is None or getattr(node, 'tagName', None) is None:
        return False
    styleId = _getParagraphStyleId(node)
    if styleId is None:
        return False
    styleName = _getStyleNameById(styleId, styleXml)
    if styleName is None:
        return False
    for fragment in fragments:
        if styleName.find(fragment) != -1:
            return True
    return False
2136   
def checkEmptyParagraphs(document):
    '''Counts the empty paragraphs of the document.

    Each w:p-element of word/document.xml is judged by isParagraphEmpty,
    which also implements the exceptions listed below.

    @note: Exceptions:

    Picture in the document produces an empty paragraph.
    Empty table cell produces an empty paragraph.
    A table produces an empty paragraph right after the table.
    ...?

    @param document: Mapping that holds the parsed parts under the keys 'word/document.xml' and 'word/styles.xml'.

    @return: amount of empty paragraph occurrences in the document, False if none was found.
    '''
    emptyTotal = 0
    for paragraph in document['word/document.xml'].getElementsByTagName('w:p'):
        if isParagraphEmpty(paragraph, document['word/styles.xml']) is True:
            emptyTotal += 1

    # A zero count is reported as False.
    return emptyTotal or False
2163   
def checkList(document, listName='List'):
    ''' Goes through all paragraph elements in the document looking for paragraphs that use some list style.

    @param document: Mapping that holds the parsed parts under the keys 'word/document.xml' and 'word/styles.xml'.

    @param listName: The list stylename we want to check. Defaults to 'List',
                     which finds list styles such as 'List', 'List Bullet', 'List Numbered'.

    @return: True, if a list style is used in the document, False otherwise.
    '''
    docXml = document['word/document.xml']
    styleXml = document['word/styles.xml']

    for p in docXml.getElementsByTagName('w:p'):
        styleId = _getParagraphStyleId(p)
        if styleId is None:
            continue
        styleName = _getStyleNameById(styleId, styleXml)
        # The style id may not resolve to a name; skip instead of failing on None.find().
        if styleName is not None and styleName.find(listName) != -1:
            return True

    return False
2183   
def checkSpreadsheetChart(document):
    '''Checks that the document has a chart copied from a spreadsheet document.
    The Chart must be pasted as a link.

    Only w:object-elements that contain a v:formulas-element and an o:OLEObject-element
    whose ProgID mentions 'Excel' are considered. The first such object decides the result.

    @param document: Mapping that holds the parsed parts under the keys
                     'word/document.xml' and 'word/_rels/document.xml.rels'.

    @return: True if the first matching object is a link whose target name looks like a chart,
             False if it does not or if there is no matching object at all,
             the string "Spreadsheet object is not pasted as a link." if the object is not linked.
    '''

    #TODO: not implemented in word_processing 13.5.2011
    docXml = document['word/document.xml']
    docRelsXml = document['word/_rels/document.xml.rels']

    for objectElement in docXml.getElementsByTagName('w:object'):
        if len(objectElement.getElementsByTagName('v:formulas')) == 0:
            continue

        oleObjects = objectElement.getElementsByTagName('o:OLEObject')
        if len(oleObjects) == 0:
            continue
        oleObject = oleObjects[0]

        if oleObject.getAttribute('ProgID').find('Excel') == -1:
            continue

        if oleObject.getAttribute('Type') != 'Link':
            return "Spreadsheet object is not pasted as a link."

        target = getRelsTargetByRId(oleObject.getAttribute('r:id'), docRelsXml)
        # The chart name is the part after the last '!'.
        #Example: targetChartName = %5bmalli.xlsx%5dmalli%20Chart%201
        targetChartName = target.split("!")[-1]

        #TODO: more effective examination required, just check that there are at least three %-characters.
        return targetChartName.count('%') >= 3

    return False
2228   
def checkSpreadsheetTable(document):
    '''Checks that the document has a table copied from a spreadsheet document.
    For now checks that the table is pasted as a link.

    Only w:object-elements that contain no v:formulas-element and have an o:OLEObject-element
    whose ProgID mentions 'Excel' are considered. The first such object decides the result.

    @param document: Mapping that holds the parsed parts under the keys
                     'word/document.xml' and 'word/_rels/document.xml.rels'.

    @return: True if the first matching object is a link whose target looks like a cell range,
             False if it does not or if there is no matching object at all,
             the string "Spreadsheet object is not pasted as a link." if the object is not linked.
    '''
    docXml = document['word/document.xml']
    docRelsXml = document['word/_rels/document.xml.rels']

    for objectElement in docXml.getElementsByTagName('w:object'):
        if len(objectElement.getElementsByTagName('v:formulas')) > 0:
            continue

        oleObjects = objectElement.getElementsByTagName('o:OLEObject')
        if len(oleObjects) == 0:
            continue
        oleObject = oleObjects[0]

        if oleObject.getAttribute('ProgID').find('Excel') == -1:
            continue

        if oleObject.getAttribute('Type') != 'Link':
            return "Spreadsheet object is not pasted as a link."

        target = getRelsTargetByRId(oleObject.getAttribute('r:id'), docRelsXml)
        # The cell range is the part after the last '!'.
        #Example: targetTableCells = R1C1:R7C4
        targetTableCells = target.split("!")[-1]

        #TODO: more effective examination might be required, just check that 2 R- and 2 C-characters are found.
        return targetTableCells.count('R') == 2 and targetTableCells.count('C') == 2

    return False
2270   
def checkPresentationGraphicsChart(document):
    '''Checks that the document contains a chart pasted from PowerPoint as a vector graphics picture or as an object.
    Doesn't really know if the picture or object is actually from PowerPoint!

    Currently only the image targets listed in the document relationships are examined:
    a target with the file extension .emf or .wmf counts as a vector graphics picture.

    @param document: Mapping that holds the parsed relationships part under the key 'word/_rels/document.xml.rels'.

    @return: True if a vector graphics picture target was found, False otherwise.
    '''

    #TODO: not implemented in word_processing 13.5.2011

    docRelsXml = document['word/_rels/document.xml.rels']

    for target in getImagePaths(docRelsXml):
        # Both extensions are matched with their dot so that only real file extensions qualify.
        if target.endswith(('.emf', '.wmf')):
            return True

    # TODO: check the object if no vector graphics picture is found.

    # Normal jpg or png pictures may be inside the w:drawing-element.
    #drawingElements = doc.getElementsByTagName('w:drawing')
    #if len(drawingElements) > 0:
    #    return True

    return False
2297