From 0abaa6634d2a20fdec6d281556b6adf33fcfc9bd Mon Sep 17 00:00:00 2001
From: Hal De
Date: Sat, 25 Apr 2020 20:19:59 +0300
Subject: [PATCH] Support for manipulating app.xml

Expose the extended ("app") document properties stored in
``/docProps/app.xml`` through a new ``Document.app_properties`` attribute.

* ``CT_AppProperties`` (docx/oxml/appprops.py) is the custom element class
  registered for the ``ap:Properties`` root element.
* ``AppPropertiesPart`` (docx/opc/parts/appprops.py) wraps the part;
  ``OpcPackage`` creates a default one when the package has none.
* ``AppProperties`` (docx/opc/appprops.py) is the read/write proxy for
  template, total_time, pages, words, characters, lines, company,
  app_version and paragraphs. Values are read and written as text and are
  limited to 255 characters.

---
 docx/__init__.py           |   3 +
 docx/document.py           |   8 ++
 docx/opc/appprops.py       |  92 +++++++++++++++
 docx/opc/constants.py      |   6 +
 docx/opc/oxml.py           |   6 +
 docx/opc/package.py        |  21 ++++
 docx/opc/parts/appprops.py |  51 ++++++++
 docx/oxml/__init__.py      |   3 +
 docx/oxml/appprops.py      | 235 +++++++++++++++++++++++++++++++++++++
 docx/oxml/ns.py            |   1 +
 docx/parts/document.py     |   8 ++
 11 files changed, 434 insertions(+)
 create mode 100644 docx/opc/appprops.py
 create mode 100644 docx/opc/parts/appprops.py
 create mode 100644 docx/oxml/appprops.py

diff --git a/docx/__init__.py b/docx/__init__.py
index 4dae2946b..e142129a9 100644
--- a/docx/__init__.py
+++ b/docx/__init__.py
@@ -10,6 +10,7 @@
 from docx.opc.constants import CONTENT_TYPE as CT, RELATIONSHIP_TYPE as RT
 from docx.opc.part import PartFactory
 from docx.opc.parts.coreprops import CorePropertiesPart
+from docx.opc.parts.appprops import AppPropertiesPart
 
 from docx.parts.document import DocumentPart
 from docx.parts.hdrftr import FooterPart, HeaderPart
@@ -26,6 +27,7 @@ def part_class_selector(content_type, reltype):
 
 
 PartFactory.part_class_selector = part_class_selector
+PartFactory.part_type_for[CT.OPC_APP_PROPERTIES] = AppPropertiesPart
 PartFactory.part_type_for[CT.OPC_CORE_PROPERTIES] = CorePropertiesPart
 PartFactory.part_type_for[CT.WML_DOCUMENT_MAIN] = DocumentPart
 PartFactory.part_type_for[CT.WML_FOOTER] = FooterPart
@@ -36,6 +38,7 @@ def part_class_selector(content_type, reltype):
 
 del (
     CT,
+    AppPropertiesPart,
     CorePropertiesPart,
     DocumentPart,
     FooterPart,
diff --git a/docx/document.py b/docx/document.py
index 6493c458b..0bd49f690 100644
--- a/docx/document.py
+++ b/docx/document.py
@@ -101,6 +101,14 @@ def core_properties(self):
         """
         return self._part.core_properties
 
+    @property
+    def app_properties(self):
+        """
+        An |AppProperties| object providing read/write access to the app
+        properties of this document.
+        """
+        return self._part.app_properties
+
     @property
     def inline_shapes(self):
         """
diff --git a/docx/opc/appprops.py b/docx/opc/appprops.py
new file mode 100644
index 000000000..a25478171
--- /dev/null
+++ b/docx/opc/appprops.py
@@ -0,0 +1,92 @@
+# encoding: utf-8
+
+"""
+Proxy for the app (extended) document properties, those stored in the
+``/docProps/app.xml`` part of a package.
+"""
+
+from __future__ import (
+    absolute_import, division, print_function, unicode_literals
+)
+from lxml import etree
+
+
+class AppProperties(object):
+    """
+    Corresponds to part named ``/docProps/app.xml``, containing the app
+    document properties for this document package.
+    """
+    def __init__(self, element):
+        self._element = element
+
+    @property
+    def template(self):
+        return self._element.template_text
+
+    @template.setter
+    def template(self, value):
+        self._element.template_text = value
+
+    @property
+    def total_time(self):
+        return self._element.total_time_text
+
+    @total_time.setter
+    def total_time(self, value):
+        self._element.total_time_text = value
+
+    @property
+    def pages(self):
+        return self._element.pages_text
+
+    @pages.setter
+    def pages(self, value):
+        self._element.pages_text = value
+
+    @property
+    def words(self):
+        return self._element.words_text
+
+    @words.setter
+    def words(self, value):
+        self._element.words_text = value
+
+    @property
+    def characters(self):
+        return self._element.characters_text
+
+    @characters.setter
+    def characters(self, value):
+        self._element.characters_text = value
+
+    @property
+    def lines(self):
+        return self._element.lines_text
+
+    @lines.setter
+    def lines(self, value):
+        self._element.lines_text = value
+
+    @property
+    def company(self):
+        return self._element.company_text
+
+    @company.setter
+    def company(self, value):
+        self._element.company_text = value
+
+    @property
+    def app_version(self):
+        return self._element.app_version_text
+
+    @app_version.setter
+    def app_version(self, value):
+        self._element.app_version_text = value
+
+    @property
+    def paragraphs(self):
+        return self._element.paragraphs_text
+
+    @paragraphs.setter
+    def paragraphs(self, value):
+        self._element.paragraphs_text = value
diff --git a/docx/opc/constants.py b/docx/opc/constants.py
index b90aa394a..6d6f124dd 100644
--- a/docx/opc/constants.py
+++ b/docx/opc/constants.py
@@ -77,6 +77,9 @@ class CONTENT_TYPE(object):
     OPC_CORE_PROPERTIES = (
         'application/vnd.openxmlformats-package.core-properties+xml'
     )
+    OPC_APP_PROPERTIES = (
+        'application/vnd.openxmlformats-officedocument.extended-properties+xml'
+    )
     OPC_DIGITAL_SIGNATURE_CERTIFICATE = (
         'application/vnd.openxmlformats-package.digital-signature-certificat'
         'e'
@@ -412,6 +415,9 @@ class RELATIONSHIP_TYPE(object):
         'http://schemas.openxmlformats.org/package/2006/relationships/metada'
         'ta/core-properties'
     )
+    APP_PROPERTIES = (
+        'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties'
+    )
     CUSTOM_PROPERTIES = (
         'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
         '/custom-properties'
diff --git a/docx/opc/oxml.py b/docx/opc/oxml.py
index 494b31dca..c8832fa24 100644
--- a/docx/opc/oxml.py
+++ b/docx/opc/oxml.py
@@ -21,6 +21,7 @@
 
 nsmap = {
     'ct': NS.OPC_CONTENT_TYPES,
+    'ap': NS.OPC_CONTENT_TYPES,
     'pr': NS.OPC_RELATIONSHIPS,
     'r':  NS.OFC_RELATIONSHIPS,
 }
@@ -287,6 +288,11 @@ def overrides(self):
 ct_namespace['Override'] = CT_Override
 ct_namespace['Types'] = CT_Types
 
+ap_namespace = element_class_lookup.get_namespace(nsmap['ap'])
+ap_namespace['Default'] = CT_Default
+ap_namespace['Override'] = CT_Override
+ap_namespace['Types'] = CT_Types
+
 pr_namespace = element_class_lookup.get_namespace(nsmap['pr'])
 pr_namespace['Relationship'] = CT_Relationship
 pr_namespace['Relationships'] = CT_Relationships
diff --git a/docx/opc/package.py b/docx/opc/package.py
index 7ba87bab5..cd9331a94 100644
--- a/docx/opc/package.py
+++ b/docx/opc/package.py
@@ -8,6 +8,7 @@
 from docx.opc.packuri import PACKAGE_URI, PackURI
 from docx.opc.part import PartFactory
 from docx.opc.parts.coreprops import CorePropertiesPart
+from docx.opc.parts.appprops import AppPropertiesPart
 from docx.opc.pkgreader import PackageReader
 from docx.opc.pkgwriter import PackageWriter
 from docx.opc.rel import Relationships
@@ -41,6 +42,14 @@ def core_properties(self):
         """
         return self._core_properties_part.core_properties
 
+    @property
+    def app_properties(self):
+        """
+        |AppProperties| object providing read/write access to the
+        App properties for this document.
+        """
+        return self._app_properties_part.app_properties
+
     def iter_rels(self):
         """
         Generate exactly one reference to each relationship in the package by
@@ -184,6 +193,18 @@ def _core_properties_part(self):
             self.relate_to(core_properties_part, RT.CORE_PROPERTIES)
             return core_properties_part
 
+    @property
+    def _app_properties_part(self):
+        """
+        |AppPropertiesPart| object related to this package. Creates
+        a default app properties part if one is not present (not common).
+        """
+        try:
+            return self.part_related_by(RT.APP_PROPERTIES)
+        except KeyError:
+            app_properties_part = AppPropertiesPart.default(self)
+            self.relate_to(app_properties_part, RT.APP_PROPERTIES)
+            return app_properties_part
 
 class Unmarshaller(object):
     """Hosts static methods for unmarshalling a package from a |PackageReader|."""
diff --git a/docx/opc/parts/appprops.py b/docx/opc/parts/appprops.py
new file mode 100644
index 000000000..0775f60d8
--- /dev/null
+++ b/docx/opc/parts/appprops.py
@@ -0,0 +1,51 @@
+# encoding: utf-8
+
+"""
+App properties part, corresponds to ``/docProps/app.xml`` part in package.
+"""
+
+from __future__ import (
+    absolute_import, division, print_function, unicode_literals
+)
+
+from datetime import datetime
+
+from ..constants import CONTENT_TYPE as CT
+from ..appprops import AppProperties
+from ...oxml.appprops import CT_AppProperties
+from ..packuri import PackURI
+from ..part import XmlPart
+
+
+class AppPropertiesPart(XmlPart):
+    """
+    Corresponds to part named ``/docProps/app.xml``, containing the app
+    document properties for this document package.
+    """
+    @classmethod
+    def default(cls, package):
+        """
+        Return a new |AppPropertiesPart| object initialized with default
+        values for its base properties.
+        """
+        app_properties_part = cls._new(package)
+        app_properties = app_properties_part.app_properties
+        app_properties.app_version = '1.0'
+        return app_properties_part
+
+    @property
+    def app_properties(self):
+        """
+        An |AppProperties| object providing read/write access to the app
+        properties contained in this app properties part.
+        """
+        return AppProperties(self.element)
+
+    @classmethod
+    def _new(cls, package):
+        partname = PackURI('/docProps/app.xml')
+        content_type = CT.OPC_APP_PROPERTIES
+        appProperties = CT_AppProperties.new()
+        return AppPropertiesPart(
+            partname, content_type, appProperties, package
+        )
diff --git a/docx/oxml/__init__.py b/docx/oxml/__init__.py
index 093c1b45b..0925cde41 100644
--- a/docx/oxml/__init__.py
+++ b/docx/oxml/__init__.py
@@ -72,6 +72,9 @@ def OxmlElement(nsptag_str, attrs=None, nsdecls=None):
 from .coreprops import CT_CoreProperties  # noqa
 register_element_cls('cp:coreProperties', CT_CoreProperties)
 
+from .appprops import CT_AppProperties  # noqa
+register_element_cls('ap:Properties', CT_AppProperties)
+
 from .document import CT_Body, CT_Document  # noqa
 register_element_cls('w:body', CT_Body)
 register_element_cls('w:document', CT_Document)
diff --git a/docx/oxml/appprops.py b/docx/oxml/appprops.py
new file mode 100644
index 000000000..70d2df5a8
--- /dev/null
+++ b/docx/oxml/appprops.py
@@ -0,0 +1,235 @@
+# encoding: utf-8
+
+"""Custom element classes for app properties-related XML elements"""
+
+from __future__ import (
+    absolute_import, division, print_function, unicode_literals
+)
+
+import re
+
+from datetime import datetime, timedelta
+
+from docx.compat import is_string
+from docx.oxml import parse_xml
+from docx.oxml.ns import nsdecls, qn
+from docx.oxml.xmlchemy import BaseOxmlElement, ZeroOrOne
+
+
+class CT_AppProperties(BaseOxmlElement):
+    """
+    ``<ap:Properties>`` element, the root element of the App Properties
+    part stored as ``/docProps/app.xml``. Implements several of the
+    extended (application) document properties. String elements resolve
+    to an empty string ('') if the element is not present in the XML.
+    String elements are limited in length to 255 unicode characters.
+    """
+    template = ZeroOrOne('ap:Template', successors=())
+    totaltime = ZeroOrOne('ap:TotalTime', successors=())
+    pages = ZeroOrOne('ap:Pages', successors=())
+    words = ZeroOrOne('ap:Words', successors=())
+    characters = ZeroOrOne('ap:Characters', successors=())
+    lines = ZeroOrOne('ap:Lines', successors=())
+    company = ZeroOrOne('ap:Company', successors=())
+    paragraphs = ZeroOrOne('ap:Paragraphs', successors=())
+    version = ZeroOrOne('ap:AppVersion', successors=())
+
+    _appProperties_tmpl = (
+        '<ap:Properties %s/>\n' % nsdecls('ap')
+    )
+
+    @classmethod
+    def new(cls):
+        xml = cls._appProperties_tmpl
+        appProperties = parse_xml(xml)
+        return appProperties
+
+    @property
+    def template_text(self):
+        return self._text_of_element('template')
+
+    @template_text.setter
+    def template_text(self, value):
+        self._set_element_text('template', value)
+
+    @property
+    def total_time_text(self):
+        return self._text_of_element('totaltime')
+
+    @total_time_text.setter
+    def total_time_text(self, value):
+        self._set_element_text('totaltime', value)
+
+    @property
+    def pages_text(self):
+        return self._text_of_element('pages')
+
+    @pages_text.setter
+    def pages_text(self, value):
+        self._set_element_text('pages', value)
+
+    @property
+    def words_text(self):
+        return self._text_of_element('words')
+
+    @words_text.setter
+    def words_text(self, value):
+        self._set_element_text('words', value)
+
+    @property
+    def characters_text(self):
+        return self._text_of_element('characters')
+
+    @characters_text.setter
+    def characters_text(self, value):
+        self._set_element_text('characters', value)
+
+    @property
+    def lines_text(self):
+        return self._text_of_element('lines')
+
+    @lines_text.setter
+    def lines_text(self, value):
+        self._set_element_text('lines', value)
+
+    @property
+    def company_text(self):
+        return self._text_of_element('company')
+
+    @company_text.setter
+    def company_text(self, value):
+        self._set_element_text('company', value)
+
+    @property
+    def paragraphs_text(self):
+        return self._text_of_element('paragraphs')
+
+    @paragraphs_text.setter
+    def paragraphs_text(self, value):
+        self._set_element_text('paragraphs', value)
+
+    @property
+    def app_version_text(self):
+        return self._text_of_element('version')
+
+    @app_version_text.setter
+    def app_version_text(self, value):
+        self._set_element_text('version', value)
+
+    def _datetime_of_element(self, property_name):
+        element = getattr(self, property_name)
+        if element is None:
+            return None
+        datetime_str = element.text
+        try:
+            return self._parse_W3CDTF_to_datetime(datetime_str)
+        except ValueError:
+            # invalid datetime strings are ignored
+            return None
+
+    def _get_or_add(self, prop_name):
+        """
+        Return element returned by 'get_or_add_' method for *prop_name*.
+        """
+        get_or_add_method_name = 'get_or_add_%s' % prop_name
+        get_or_add_method = getattr(self, get_or_add_method_name)
+        element = get_or_add_method()
+        return element
+
+    @classmethod
+    def _offset_dt(cls, dt, offset_str):
+        """
+        Return a |datetime| instance that is offset from datetime *dt* by
+        the timezone offset specified in *offset_str*, a string like
+        ``'-07:00'``.
+        """
+        match = cls._offset_pattern.match(offset_str)
+        if match is None:
+            raise ValueError(
+                "'%s' is not a valid offset string" % offset_str
+            )
+        sign, hours_str, minutes_str = match.groups()
+        sign_factor = -1 if sign == '+' else 1
+        hours = int(hours_str) * sign_factor
+        minutes = int(minutes_str) * sign_factor
+        td = timedelta(hours=hours, minutes=minutes)
+        return dt + td
+
+    _offset_pattern = re.compile(r'([+-])(\d\d):(\d\d)')
+
+    @classmethod
+    def _parse_W3CDTF_to_datetime(cls, w3cdtf_str):
+        # valid W3CDTF date cases:
+        # yyyy e.g. '2003'
+        # yyyy-mm e.g. '2003-12'
+        # yyyy-mm-dd e.g. '2003-12-31'
+        # UTC timezone e.g. '2003-12-31T10:14:55Z'
+        # numeric timezone e.g. '2003-12-31T10:14:55-08:00'
+        templates = (
+            '%Y-%m-%dT%H:%M:%S',
+            '%Y-%m-%d',
+            '%Y-%m',
+            '%Y',
+        )
+        # strptime isn't smart enough to parse literal timezone offsets like
+        # '-07:30', so we have to do it ourselves
+        parseable_part = w3cdtf_str[:19]
+        offset_str = w3cdtf_str[19:]
+        dt = None
+        for tmpl in templates:
+            try:
+                dt = datetime.strptime(parseable_part, tmpl)
+            except ValueError:
+                continue
+        if dt is None:
+            tmpl = "could not parse W3CDTF datetime string '%s'"
+            raise ValueError(tmpl % w3cdtf_str)
+        if len(offset_str) == 6:
+            return cls._offset_dt(dt, offset_str)
+        return dt
+
+    def _set_element_datetime(self, prop_name, value):
+        """
+        Set date/time value of child element having *prop_name* to *value*.
+        """
+        if not isinstance(value, datetime):
+            tmpl = (
+                "property requires <python-datetime> object, got %s"
+            )
+            raise ValueError(tmpl % type(value))
+        element = self._get_or_add(prop_name)
+        dt_str = value.strftime('%Y-%m-%dT%H:%M:%SZ')
+        element.text = dt_str
+        if prop_name in ('created', 'modified'):
+            # These two require an explicit 'xsi:type="dcterms:W3CDTF"'
+            # attribute. The first and last line are a hack required to add
+            # the xsi namespace to the root element rather than each child
+            # element in which it is referenced
+            self.set(qn('xsi:foo'), 'bar')
+            element.set(qn('xsi:type'), 'dcterms:W3CDTF')
+            del self.attrib[qn('xsi:foo')]
+
+    def _set_element_text(self, prop_name, value):
+        """Set string value of *prop_name* property to *value*."""
+        if not is_string(value):
+            value = str(value)
+
+        if len(value) > 255:
+            tmpl = (
+                "exceeded 255 char limit for property, got:\n\n'%s'"
+            )
+            raise ValueError(tmpl % value)
+        element = self._get_or_add(prop_name)
+        element.text = value
+
+    def _text_of_element(self, property_name):
+        """
+        Return the text in the element matching *property_name*, or an empty
+        string if the element is not present or contains no text.
+        """
+        element = getattr(self, property_name)
+        if element is None:
+            return ''
+        if element.text is None:
+            return ''
+        return element.text
diff --git a/docx/oxml/ns.py b/docx/oxml/ns.py
index 6b0861284..8f062c501 100644
--- a/docx/oxml/ns.py
+++ b/docx/oxml/ns.py
@@ -9,6 +9,7 @@
 
 nsmap = {
     "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
+    "ap": "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties",
     "c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
     "cp": "http://schemas.openxmlformats.org/package/2006/metadata/core-properties",
     "dc": "http://purl.org/dc/elements/1.1/",
diff --git a/docx/parts/document.py b/docx/parts/document.py
index 59d0b7a71..cac7ab457 100644
--- a/docx/parts/document.py
+++ b/docx/parts/document.py
@@ -44,6 +44,14 @@ def core_properties(self):
         """
         return self.package.core_properties
 
+    @property
+    def app_properties(self):
+        """
+        An |AppProperties| object providing read/write access to the app
+        properties of this document.
+        """
+        return self.package.app_properties
+
     @property
     def document(self):
         """