Package src :: Package inspectors :: Module docx_inspector

[frames | no frames]

Module docx_inspector

source code

The module provides the methods for inspecting docx files.

Author: Vili Auvinen, Juho Tammela

Functions

[hide private]

_getStyleElementById(styleId, styleXml)
Gets a style element by the style id from the styles.xml.

source code

_getStyleElementByName(styleName, styleXml)
Gets a style element by a style name from styles.xml.

source code

_getBasedOnStyleId(styleName, styleXml)
Get the based-on style style id for a given style from styles.xml.

source code

_getStyleName(styleElement)
Get a style name of the style element.

source code

_getStyleNameById(styleId, styleXml)
Get the name of a style with a given style id.

source code

_getStyleIdByName(styleName, styleXml)
Get the id of a style with a given style name from styles.xml.

source code

_getParagraphStyleId(p)
Gets the style id of a paragraph element.

source code

_getThemeFont(themeXml, styleDefinitions, themeFont)
Gets a themefont from theme1.xml.

source code

_getStyleDefinitions(element, styleDefinitions)
Return style definitions of a given element.

source code

getStyle(document, requirementStyleName)
Gets all definitions of a style from document dictionary.

source code

_getCompleteStyleDefinitions(styleXml, styleName, themeXml)
Returns the style definition of the given style from style.xml and theme1.xml.

source code

_getElementValueWithinElement(elementTagName, element)
Gets the text content of the first element with a certain tag in the given DOM tree.

source code

_getElementWithinElement(element, elementTagName)
Gets the first child of an element with the given tag name.

source code

_getElementsWithinElement(element, elementTagName)
Gets the children of an element with the given tag name.

source code

_getTargetXmlFileByHeader(header, document)
Gets a header reference target xml-file as a DOM tree.

source code

_checkFrontPageHeadersAndFooters(references, document)
Goes through header or footer references and checks if there is any content in them.

source code

_checkAutomaticPageNumbering(section, headerReference, footerReference, document, errorIds, numStartKey)
Checks if a section has an automatic page numbering and gets the numbering format.

source code

_checkNameInHeaderOrFooter(reference, document)
Looks for text inside a header or footer and sees if the last modifier's name is in there.

source code

_getPgNumFormat(sectionPgNumType)
Gets the number format of the given section page number type.

source code

checkHeadersAndFooters(document)
Checks that the headers and footers of a document are made correctly.

source code

getParagraphElementsBySections(docXml, sectionName)
Get paragraph elements of the wanted section.

source code

getSectionElementsBySections(docXml, index=None)
Gets all the w:sectPr elements of a document or optionally the w:sectPr elements of a specific section.

source code

_areSectionsOverlapping(outerParagraphElements, innerParagraphElements, errorList, errorMsg, expectedResult)
Goes through two lists of paragraph elements checking if the same paragraph is in both lists.

source code

checkSections(document, errorList)
Goes through the section elements in the document checking that the sections are done properly.

source code

_checkPageProperties(allSectionProperties, pageProperties, tagName)
Goes through all section properties to see that they have coherent property values.

source code

_convertSizes(sizesDict)

source code

getPageMarginals(document)
Gets the document page marginals sizes.

source code

getPageSize(document)
Gets the document page sizes.

source code

_getTitle(coreXml)
Gets the title as set in document setting, None if not found.

source code

_getCreator(coreXml)
Gets the document creator as set in document setting, None if not found.

source code

_getLastModifier(coreXml)
Gets the document last modifier as set in document setting, None if not found.

source code

_getCreateDate(coreXml)
Gets the document creation date as found in document setting, None if not found.

source code

_getLastModifiedDate(coreXml)
Gets the document last modified date as found in document setting, None if not found.

source code

_getRevision(coreXml)
Gets revision of the document as found in document setting, None if not found.

source code

_getTextFromParagraph(paragraph)
Gets the text content of <w:t>-elements from the given (paragraph) element.

source code

checkTocContent(document)
Checks if all of the headings created in the document are listed in the table of contents.

source code

checkTOC(document)
Check if table of contents is done correctly.

source code

checkCoverPage(document)
Checks if the front page is done correctly

source code

getRelsTargetByRId(rId, rels)
Returns the value of Target attribute of a Relationship element with the given id in a given rels file.

source code

getParentParagraph(element, tag='w:p')
Returns the parent <w:p>-element of a given element if there is one.

source code

checkImages(document)
Check if there is an image in the document.

source code

getImagePaths(document)
Gets the image paths or the file names of the images used in the document.

source code

checkImageCaptions(document)
Checks if the next paragraph after a picture paragraph uses the caption style.

source code

checkStyleUsage(document, errorIdsAndPositions)
Checks that text paragraphs are using styles and that no manual style definitions are made.

source code

checkEndnotesAndFootnotes(document)
Checks if there is an endnote or a footnote in the document.

source code

checkCrossRefrenceToImageCaption(document)
Goes through images' captions looking for a reference.

source code

_getElementByAttributeValue(nodeList, attributeName, attributeValue)
Gets an element by an attribute value.

source code

_isStyleUsed(document, styleName)
Checks that a style is used in the document.

source code

checkHeadingNumbering(document, errorIdsAndPositions)
Checks the headings in the document.

source code

_getParagraphElementsByStyleId(docXml, styleId)
Gets all paragraph-elements in the document by a style id.

source code

_getParagraphElementsBySequentialStyleName(styleNamePrefix, styleXml, docXml)
Return all paragraph elements that use a style name with a sequential numbering.

source code

checkIndex(document)
Checks that the document has an automatically made index.

source code

checkIndexContent(document)
Checks that the document has an index that is not empty, and that the index entries are referenced somewhere in the document.

source code

checkDoubleWhitespaces(document)
Checks double whitespaces in the document.

source code

checkAsterisk(document)
Checks the *-character in the document.

source code

checkStringFromDocument(docXml, string)
Checks if a string is found in the text content of the document (in the w:t-elements).

source code

checkTabs(document)
Checks if the tabulator is used in the document.

source code

isParagraphEmpty(p, styleXml)
Checks if a paragraph is empty.

source code

checkEmptyParagraphs(document)
Finds all empty paragraphs in the document.

source code

checkList(document, listName='List')
Goes through all paragraph elements in the document looking for paragraphs that use some list style.

source code

checkSpreadsheetChart(document)
Checks that the document has a chart copied from a spreadsheet document.

source code

checkSpreadsheetTable(document)
Checks that the document has a table copied from a spreadsheet document.

source code

checkPresentationGraphicsChart(document)
Checks that the document contains a chart pasted from PowerPoint as a vector graphics picture or as an object.

source code

Variables

[hide private]

__package__ = 'src.inspectors'

Function Details

[hide private]

_getStyleElementById(styleId, styleXml)

source code

Gets a style element by the style id from the styles.xml.

The style id links a paragraph using a style in document.xml to the right style in styles.xml. The style id can be in different languages depending on the language of the Word installation that wrote the document.

Parameters:

styleId - The style id.
styleXml - styles.xml as a DOM tree.

Returns:

The style element with a given style id or None if no matching style element was found.

Note: XML example:

<w:p> <w:pPr> <w:pStyle w:val="Otsikko1"/> (This is the style id.) </w:pPr> <r> ... </r> </w:p>

_getStyleElementByName(styleName, styleXml)

source code

Gets a style element by a style name from styles.xml.

The style name is found in styles.xml. A style always has the same name regardless of the language of the Word installation that wrote the document.

Parameters:

styleName - The style name.
styleXml - styles.xml as a DOM tree.

Returns:

The style-element with a given style name, or None if no matching style element was found.

Note: XML example:

<w:style w:type="paragraph" w:styleId="Otsikko1"> (This is the style id.) <w:name w:val="heading 1"/> (Here is the style name.) ... </w:style>

_getBasedOnStyleId(styleName, styleXml)

source code

Get the based-on style style id for a given style from styles.xml.

Parameters:

styleName - The style name of the style whose based-on style id is wanted.
styleXml - styles.xml as a DOM tree.

Returns:

The id of the based on style for a given style name, or None if there was no based on style.

_getStyleName(styleElement)

source code

Get a style name of the style element.

Parameters:

styleElement - The style element whose style name is wanted.

Returns:

The style name of a given style element.

_getStyleNameById(styleId, styleXml)

source code

Get the name of a style with a given style id.

Parameters:

styleId - The style id to be looked for.
styleXml - styles.xml as a DOM tree.

Returns:

The style name of the style with the correct style id, or None if it wasn't found.

_getStyleIdByName(styleName, styleXml)

source code

Get the id of a style with a given style name from styles.xml.

Parameters:

styleName - The style name to be looked for.
styleXml - styles.xml as a DOM tree.

Returns:

The style id of the style with the correct style name, or None if it wasn't found.

_getParagraphStyleId(p)

source code

Gets the style id of a paragraph element.

Parameters:

p - The paragraph element.

Returns:

The style id if it was found, otherwise returns None.

_getThemeFont(themeXml, styleDefinitions, themeFont)

source code

Gets a themefont from theme1.xml.

Parameters:

themeXml - theme1.xml as DOM tree.
styleDefinitions - The style definitions dict.
themeFont - The theme font. Should be either 'majorFont' or 'minorFont'.

Returns:

The style definitions with or without changes.

Note: XML example:

<a:fontScheme name="Office"> <a:majorFont> <a:latin typeface="Cambria"/> <a:ea typeface=""/> <a:cs typeface=""/> </a:majorFont> <a:minorFont> <a:latin typeface="Calibri"/> <a:ea typeface=""/> <a:cs typeface=""/> </a:minorFont> </a:fontScheme>

See Also: _getCompleteStyleDefinitions.

_getStyleDefinitions(element, styleDefinitions)

source code

Return style definitions of a given element. First checks if the element has any children and uses recursion if some are found. Next checks if the element has attributes.

If the attribute name is a key in the dict, stores the value of the attribute.
If the attribute name is 'w:val' and the element tag name is a key in the dict, stores the value of the attribute.

If the element tag name is a key in the dict and the element doesn't have any attributes or children, stores value '1' in the dict.

Parameters:

element - Style definitions are searched inside this element
styleDefinitions - A dict where the style definitions are stored. May contain tag names or attribute names and some default values.

Returns:

The style definitions in a dict.

getStyle(document, requirementStyleName)

source code

Gets all definitions of a style from document dictionary.

Converts twips to centimeters.

Returns:: A dict with all the style definitions of the one style, with keys translated to match the return value of odt_inspector's getStyle(). False, if the style was not found.

_getCompleteStyleDefinitions(styleXml, styleName, themeXml)

source code

Returns the style definition of the given style from style.xml and theme1.xml. Recursion used because the style can be based on some other style. In addition, the base style gets style definitions from the document defaults. Finally, some style definitions are not found in the XML file at all. These definitions use some default value which must be assumed.

Parameters:

styleXml - styles.xml-file as a DOM tree.
styleName - The name of the style (NOT the id)
themeXml - theme1.xml as DOM tree.

Returns:

Style definitions in a dict.

Note: XML example:

<w:style w:type="paragraph" w:default="1" w:styleId="Normaali"> <w:name w:val="Normal"/> <w:qFormat/> <w:rsid w:val="006B493C"/> <w:pPr> <w:spacing w:before="140" w:after="220" w:line="360" w:lineRule="auto"/> <w:ind w:left="567"/> <w:jc w:val="both"/> </w:pPr> <w:rPr> <w:rFonts w:ascii="Georgia" w:hAnsi="Georgia"/> <w:lang w:val="fi-FI"/> </w:rPr> </w:style>

See Also: _getStyleElementById and _getStyleElementByName for difference.

_getElementValueWithinElement(elementTagName, element)

source code

Gets the text content of the first element with a certain tag in the given DOM tree.

Returns:: The text value of the element, or None if something went wrong.

_getElementWithinElement(element, elementTagName)

source code

Gets the first child of an element with the given tag name.

Returns the element of the given parent element with the given elementTagName.

Parameters:

element - The element whose children are searched.
elementTagName - The tag name of the wanted element.

Returns:

An element with the right tag name, or None if it wasn't found.

_getElementsWithinElement(element, elementTagName)

source code

Gets the children of an element with the given tag name.

Returns the element of the given parent element by the given elementTagName.

Parameters:

element - The element whose children are searched.
elementTagName - Tag name of the wanted elements.

Returns:

The list of elements with the right tag name, or None if none was found.

_checkFrontPageHeadersAndFooters(references, document)

source code

Goes through header or footer references and checks if there is any content in them.

Checks if there are headers or footers in the front page by looking for <w:t> tags. Even if there are references to headers or footers, they might be empty.

Parameters:

references - Header or footer references.

Returns:

The header or footer target XML file as a DOM tree, or None if no headers or footers were found.

_checkAutomaticPageNumbering(section, headerReference, footerReference, document, errorIds, numStartKey)

source code

Checks if a section has an automatic page numbering and gets the numbering format.

First goes through the section element and checks that the numbering starts at 1. Gets the numbering format definition of the section. If it is defined, returns it. If a numbering format is not found in the section properties, it defaults to 'Standard'. If the numbering format is standard, checks the header and footer references for other numbering format definitions. The numbering format in the header or the footer reference is sometimes in <w:instrText> element inside the content of PAGE \* MERGEFORMAT.

Parameters:

section - The section element to be searched for.
headerReference - The current header of the section element as a DOM tree.
footerReference - The current footer of the section element as a DOM tree.
document - The document as a dict of DOM tree pairs.
errorIds - The dict for appending errors True/False.
numStartKey - The key for errorIds to append numbering start error.

Returns:

The page numbering as a string format, or False if there was no page numbering or the numbering was both in header and footer.

_checkNameInHeaderOrFooter(reference, document)

source code

Looks for text inside a header or footer and sees if the last modifier's name is in there.

Problem: sometimes we want to check that there is no name in the header or the footer. If a name is found but it's different from the last modifier's name, result is False, even though a name is in a header/footer. For now just tries to check that either the name of the last modifier or just some name was found.

Parameters:

reference - The header or footer XML file as a DOM tree.

Returns:

True if a name is found in the text, False otherwise.

_getPgNumFormat(sectionPgNumType)

source code

Gets the number format of the given section page number type.

Parameters:

sectionPgNumType - The given page number type element of the section

Returns:

The numbering format, defaults to 'Standard' if nothing else is defined.

checkHeadersAndFooters(document)

source code

Checks that the headers and footers of a document are made correctly.

Assumes that the document has three sections:

cover section
table of contents section or toc section
actual content section or text section

Returns:: Findings in the errorIds-dict as key-boolean pairs as described above.

See Also: checkSections method must pass in order to run this method

Notes:

Places findings in the errorIds-dict as key-boolean pairs:
'frontPage': were there headers or footers in the cover section.

'tocPageNumbering': is there a page numbering in the toc section.

'differentPageNumbering': is the page numbering different in the cover and text sections.

'nameInToc': is the last modifier's name in toc section header or footer.

'nameInText': is the last modifier's name in text section header or footer.

'pageNumbering': is there a page numbering in the text section.

'tocNumStart': does the toc section page numbering start at 1.

'textNumStart': does the text section page numbering start at 1.

'titlePg': is the Microsoft Office setting "Different first page" on.
XML example:
<w:pgNumType w:fmt="lowerRoman" w:start="1"/>

<w:pgNumType w:start="1"/>

getParagraphElementsBySections(docXml, sectionName)

source code

Get paragraph elements of the wanted section. Page-breaking section break elements change the section; continuous section break elements don't change the section.

The first list of the section elements is the cover section. The second list of the section elements is the table of contents-section. The third list of the section elements is the text section. The document has to have at least 3 sections.

Parameters:

docXml - The document.xml file as a DOM tree.
sectionName - The wanted section can be 'cover', 'toc' or 'text'.

Returns:

The list of the section elements.

getSectionElementsBySections(docXml, index=None)

source code

Gets all the w:sectPr elements of a document or optionally the w:sectPr elements of a specific section.

w:sectPr elements are stored in a two dimensional list. Continuous section breaks are appended to current outer list index. The page breaking section raises the outer list index.

Parameters:

index - The index of the outer pageSections list to get. None by default.

Returns:

The two dimensional list of all w:sectPr elements if index is None. Otherwise returns the list at the given index.

_areSectionsOverlapping(outerParagraphElements, innerParagraphElements, errorList, errorMsg, expectedResult)

source code

Goes through two lists of paragraph elements checking if the same paragraph is in both lists.

Parameters:

outerParagraphElements - The outer paragraph list to be searched for.
innerParagraphElements - The inner paragraph list to be searched for.
errorList - The list for appending error messages.
errorMsg - The error message to be appended.
expectedResult - Boolean of the expected result.

Returns:

expectedResult changed or unchanged.

checkSections(document, errorList)

source code

Goes through the section elements in the document checking that the sections are done properly.

There must be at least three sections in the document. The cover page and the table of the contents cannot be in the same section. Also checks that the Microsoft Office Word setting "Different first page" is off.

Returns:: True if everything went well, False if something went terribly wrong or error list if an error was found and the checking could be completed.

_checkPageProperties(allSectionProperties, pageProperties, tagName)

source code

Goes through all section properties to see that they have coherent property values.

If the property value is the same in all section elements, the value is stored in pageProperties. If something is different between the sections, it's wrong and the page property is set False. For example, if two different section elements have different page top marginal, the property is set False.

Parameters:

allSectionProperties - All w:sectPr elements of the document.
pageProperties - the allowed page properties are {'top': None, 'right': None, 'bottom': None, 'left': None} or {'w': None, 'h': None}.
tagName - Tag name of the element whose properties are checked.

Returns:

pageProperties dict with coherent page values and incoherent values set as False.

getPageMarginals(document)

source code

Gets the document page marginals sizes.

Returns:: False if the marginals are not coherent, otherwise a dictionary containing the marginal sizes.

getPageSize(document)

source code

Gets the document page sizes.

Returns:: False if the page sizes are not coherent, otherwise a dictionary containing the page width and length.

_getTextFromParagraph(paragraph)

source code

Gets the text content of <w:t>-elements from the given (paragraph) element.

Returns:: the text content as a string.

checkTocContent(document)

source code

Checks if all of the headings created in the document are listed in the table of contents.

Returns:: True if toc matches the headings content, False otherwise.

checkTOC(document)

source code

Check if table of contents is done correctly. It has to have a page break before (and after) it.

Returns:: True if toc is made correctly, False otherwise.

See Also: checkTocContent -- calls the method if there's a table of contents to be found.

Note: XML example:

<w:p w:rsidR="004A16ED" w:rsidRDefault="004A16ED" w:rsidP="006158B0">

<w:pPr>

<w:pStyle w:val="Otsikko"/>

</w:pPr>

<w:r w:rsidRPr="006158B0">

<w:lastRenderedPageBreak/>

<w:t>SISALLYSLUETTELO</w:t>

</w:r>

</w:p>

<w:p w:rsidR="002274FC" w:rsidRDefault="00FA6E61">

<w:pPr>

<w:pStyle w:val="Sisluet1"/>

checkCoverPage(document)

source code

Checks if the front page is done correctly

Returns:: coverPageText dictionary containing True or False values.

getRelsTargetByRId(rId, rels)

source code

Returns the value of Target attribute of a Relationship element with the given id in a given rels file. The value of Target attribute can be for example a relative path to local XML files or images. It can also be a hyperlink.

Parameters:

rId - Id attribute value of a Relationship element.
rels - rels file as a DOM tree.

Returns:

The value of Target attribute if found.

getParentParagraph(element, tag='w:p')

source code

Returns the parent <w:p>-element of a given element if there is one.

Parameters:

element - The element whose parent <w:p> element is searched for.
tag - The parent tagname, defaults to 'w:p'.

Returns:

The parent element, or None if no parent is found.

checkImages(document)

source code

Check if there is an image in the document.

Returns:: True if even one image is found, False otherwise.

getImagePaths(document)

source code

Gets the image paths or the file names of the images used in the document.

Returns:: The image targets as strings in a list.

checkImageCaptions(document)

source code

Checks if the next paragraph after a picture paragraph uses the caption style.

Also checks that the caption contains an automatic field. Goes through all picture paragraphs.

Returns:: True if all images have captions, False otherwise.

checkStyleUsage(document, errorIdsAndPositions)

source code

Checks that text paragraphs are using styles and that no manual style definitions are made.

Goes through all paragraph-elements in a document looking for <w:pStyle>-elements. Gets the style definitions to see if there are manual changes.

Parameters:

errorIdsAndPositions - A dict for error strings. Should contain keys 'manualChanges' and 'styleNotUsed'.

Returns:

True if nothing was found, False if even one error was found.

Note: Exception:

Automatically generated table of contents can contain "manual" style definitions. The <w:sectPr> elements within paragraph elements are skipped also.

checkEndnotesAndFootnotes(document)

source code

Checks if there is an endnote or a footnote in the document.

Looks for w:endnoteReference and w:footnoteReference elements.

Returns:: True if an endnote or a footnote is found, False otherwise.

checkCrossRefrenceToImageCaption(document)

source code

Goes through images' captions looking for a reference. Then checks if the caption is referenced somewhere.

Returns:: True if a cross reference is found, False otherwise.

_getElementByAttributeValue(nodeList, attributeName, attributeValue)

source code

Gets an element by an attribute value.

Parameters:

nodeList - A list of elements to be searched for.
attributeName - The name of the wanted attribute.
attributeValue - The wanted value of the attribute.

Returns:

The element, if it has an attribute with the wanted value, None otherwise.

_isStyleUsed(document, styleName)

source code

Checks that a style is used in the document.

Parameters:

styleName - The name of the style looked for.

Returns:

True if the style is used, False otherwise.

checkHeadingNumbering(document, errorIdsAndPositions)

source code

Checks the headings in the document.

Goes through the heading styles used in the document checking that they use a multilevel numbering, the numbering is done correctly using styles and that the numbering is connected to other heading styles.

Gets all the heading styles used in the document. Searches for the heading's numbering definition reference in styles.xml. Next searches the associated numbering definition in numbering.xml. Next searches the correct numbering level definition associated to the heading. Checks that the numbering is multilevel and done correctly using the heading styles.

Parameters:

errorIdsAndPositions - A dict for appending errors in key - stringlist pairs. Should contain the following keys:
- 'manualNumbering' -- numbering is done manually somehow.
- 'styleNotUsed' -- an expected heading style is not used.
- 'differentNumbering' -- some heading style is using different numbering than some other heading styles.
- 'notMultilevel' -- the numbering is not multilevel.
- 'outlineLvl' -- the outline of a heading style is not correct.
- 'numStart' -- the numbering doesn't start at 1.
- 'numWrong' -- the numbering is somehow not done with styles.
- 'numFormat' -- the numbering format is not correct.
- 'notSequential' -- heading styles are not used correctly in a row for example heading 3 is used after heading 1.

Note: XML example:

styles.xml:

<w:style w:type="paragraph" w:styleId="Heading2"> - Heading 2 style definition

<w:name w:val="heading 2"/>

<w:pPr>

<w:numPr>

<w:ilvl w:val="1"/> - Numbering Level Reference

<w:numId w:val="1"/> - Numbering Definition Instance Reference

</w:numPr>

<w:outlineLvl w:val="1"/>

</w:pPr>

</w:style>

numbering.xml:

<w:abstractNum w:abstractNumId="0"> - Abstract Numbering Definition

<w:multiLevelType w:val="multilevel"/> - Abstract Numbering Definition Type

<w:lvl w:ilvl="0"> - </w:lvl> - Numbering Level Definition

<w:lvl w:ilvl="1"> - Numbering Level Definition

<w:start w:val="1"/> - Starting Value

<w:numFmt w:val="decimal"/> - Numbering Format

<w:pStyle w:val="Heading2"/> - Paragraph Style's Associated Numbering Level

<w:lvlText w:val="%1.%2"/> - Numbering Level Text

<w:lvlJc w:val="left"/> - Justification

<w:pPr> - Numbering Level Associated Paragraph Properties

<w:ind w:left="576" w:hanging="576"/>

</w:pPr>

</w:lvl>

</w:abstractNum>

<w:num w:numId="1"> - Numbering Definition Instance

<w:abstractNumId w:val="0"/> - Abstract Numbering Definition Reference

</w:num>

_getParagraphElementsBySequentialStyleName(styleNamePrefix, styleXml, docXml)

source code

Return all paragraph elements that use a style name with a sequential numbering.

Gets all paragraphs that use styles with stylenames for example heading 1, heading 2, etc or index 1, index 2, etc.

Parameters:

styleNamePrefix - The prefix of the sequential style name.

checkIndex(document)

source code

Checks that the document has an automatically made index.

Returns:: False if an index is missing, '2' if index is not automatically made and True if everything was OK.

checkIndexContent(document)

source code

Checks that the document has an index that is not empty, and that the index entries are referenced somewhere in the document.

First gets all the index styles' definitions from styles.xml and finds paragraphs using the styles in the document.xml. Checks that there is a field code element indicating that the index is generated automatically. Collects the content of the index and checks it isn't empty. Finds references to the index entries and matches them to the index content.

Returns:: '3' if the index is empty, '4' if the content does not match with the document and True if everything went OK.

Note: XML example:

Index example:

<w:p w:rsidR="002F2A09" w:rsidRDefault="00CA51D5">

<w:r>

<w:fldChar w:fldCharType="begin"/>

</w:r>

<w:r>

<w:instrText xml:space="preserve"> INDEX \c "2" \z "1035" </w:instrText>

</w:r>

<w:r>

<w:fldChar w:fldCharType="separate"/>

</w:r>

</w:p>

<w:p w:rsidR="002F2A09" w:rsidRDefault="002F2A09">

<w:pPr>

<w:pStyle w:val="Index1"/>

<w:tabs>

<w:tab w:val="right" w:leader="dot" w:pos="3950"/>

</w:tabs>

</w:pPr>

<w:r>

<w:t>Index entry level 1</w:t>

</w:r>

</w:p>

Reference example:

<w:r w:rsidR="00B27B47">

<w:instrText xml:space="preserve"> XE "</w:instrText>

</w:r>

<w:r w:rsidR="00B27B47" w:rsidRPr="00B27B47">

<w:instrText>Level 1 entry</w:instrText>

</w:r>

<w:r w:rsidR="00B27B47" w:rsidRPr="00B27B47">

<w:instrText>:</w:instrText>

</w:r>

<w:r w:rsidR="00B27B47" w:rsidRPr="0011587C">

<w:instrText>Level 2 entry</w:instrText>

</w:r>

checkDoubleWhitespaces(document)

source code

Checks double whitespaces in the document.

Returns:: The amount of occurrences of the double whitespaces found in the document, False otherwise.

checkAsterisk(document)

source code

Checks the *-character in the document.

Returns:: The amount of occurrences of the asterisks found in the document, False otherwise.

checkStringFromDocument(docXml, string)

source code

Checks if a string is found in the text content of the document (in the w:t-elements). If string is found, returns how many occurrences were found in a paragraph.

Returns:: The amount of occurrences of the string found in the document, False otherwise.

checkTabs(document)

source code

Checks if the tabulator is used in the document.

Returns:: The amount of the tabulator occurrences found in the document, False if none was found.

Note: Exceptions:

automatically generated table of contents and index contain tabulators.
before an automatically generated index there is a paragraph-element with <instrText>-element and a <tab>-element.

isParagraphEmpty(p, styleXml)

source code

Checks if a paragraph is empty.

Parameters:

p - The paragraph element under inspection.

Returns:

False if the paragraph is not empty, True if it is empty.

Note: Exceptions:

Picture in the document produces an empty paragraph. Empty table cell produces an empty paragraph. A table produces an empty paragraph right after the table. Objects and graphics produce an empty paragraph. ...

checkEmptyParagraphs(document)

source code

Finds all empty paragraphs in the document.

Returns:: amount of empty paragraph occurrences in the document, False if none was found.

Note: Exceptions:

Picture in the document produces an empty paragraph. Empty table cell produces an empty paragraph. A table produces an empty paragraph right after the table. ...?

checkList(document, listName='List')

source code

Goes through all paragraph elements in the document looking for paragraphs that use some list style.

Parameters:

listName - The list stylename we want to check. Defaults to 'List', which finds list styles such as 'List', 'List Bullet', 'List Numbered'.

Returns:

True, if a list style is used in the document, False otherwise.

checkSpreadsheetChart(document)

source code

Checks that the document has a chart copied from a spreadsheet document. The Chart must be pasted as a link.

checkSpreadsheetTable(document)

source code

Checks that the document has a table copied from a spreadsheet document. For now checks that the table is pasted as a link.

checkPresentationGraphicsChart(document)

source code

Checks that the document contains a chart pasted from PowerPoint as a vector graphics picture or as an object. Doesn't really know if the picture or object is actually from PowerPoint!

Module docx_inspector

_getStyleElementById(styleId, styleXml)

_getStyleElementByName(styleName, styleXml)

_getBasedOnStyleId(styleName, styleXml)

_getStyleName(styleElement)

_getStyleNameById(styleId, styleXml)

_getStyleIdByName(styleName, styleXml)

_getParagraphStyleId(p)

_getThemeFont(themeXml, styleDefinitions, themeFont)

_getStyleDefinitions(element, styleDefinitions)

getStyle(document, requirementStyleName)

_getCompleteStyleDefinitions(styleXml, styleName, themeXml)

_getElementValueWithinElement(elementTagName, element)

_getElementWithinElement(element, elementTagName)

_getElementsWithinElement(element, elementTagName)

_checkFrontPageHeadersAndFooters(references, document)

_checkAutomaticPageNumbering(section, headerReference, footerReference, document, errorIds, numStartKey)

_checkNameInHeaderOrFooter(reference, document)

_getPgNumFormat(sectionPgNumType)

checkHeadersAndFooters(document)

getParagraphElementsBySections(docXml, sectionName)

getSectionElementsBySections(docXml, index=None)

_areSectionsOverlapping(outerParagraphElements, innerParagraphElements, errorList, errorMsg, expectedResult)

checkSections(document, errorList)

_checkPageProperties(allSectionProperties, pageProperties, tagName)

getPageMarginals(document)

getPageSize(document)

_getTextFromParagraph(paragraph)

checkTocContent(document)

checkTOC(document)

checkCoverPage(document)

getRelsTargetByRId(rId, rels)

getParentParagraph(element, tag='w:p')

checkImages(document)

getImagePaths(document)

checkImageCaptions(document)

checkStyleUsage(document, errorIdsAndPositions)

checkEndnotesAndFootnotes(document)

checkCrossRefrenceToImageCaption(document)

_getElementByAttributeValue(nodeList, attributeName, attributeValue)

_isStyleUsed(document, styleName)

checkHeadingNumbering(document, errorIdsAndPositions)

_getParagraphElementsBySequentialStyleName(styleNamePrefix, styleXml, docXml)

checkIndex(document)

checkIndexContent(document)

checkDoubleWhitespaces(document)

checkAsterisk(document)

checkStringFromDocument(docXml, string)

checkTabs(document)

isParagraphEmpty(p, styleXml)

checkEmptyParagraphs(document)

checkList(document, listName='List')

checkSpreadsheetChart(document)

checkSpreadsheetTable(document)

checkPresentationGraphicsChart(document)

getParentParagraph(element, tag='w:p')

checkList(document, listName='List')