diff --git a/src/docx/opc/phys_pkg.py b/src/docx/opc/phys_pkg.py index 5ec32237c..3514d3167 100644 --- a/src/docx/opc/phys_pkg.py +++ b/src/docx/opc/phys_pkg.py @@ -1,7 +1,8 @@ """Provides a general interface to a `physical` OPC package, such as a zip file.""" import os -from zipfile import ZIP_DEFLATED, ZipFile, is_zipfile +import zlib +from zipfile import ZIP_DEFLATED, BadZipFile, ZipFile, is_zipfile from docx.opc.exceptions import PackageNotFoundError from docx.opc.packuri import CONTENT_TYPES_URI @@ -73,14 +74,24 @@ class _ZipPkgReader(PhysPkgReader): def __init__(self, pkg_file): super(_ZipPkgReader, self).__init__() - self._zipf = ZipFile(pkg_file, "r") + try: + self._zipf = ZipFile(pkg_file, "r") + except BadZipFile as e: + raise PackageNotFoundError("Package is not a valid zip file: %s" % e) from e def blob_for(self, pack_uri): """Return blob corresponding to `pack_uri`. Raises |ValueError| if no matching member is present in zip archive. + Raises |PackageNotFoundError| if the zip entry cannot be read due to corruption, + truncation, or encryption. """ - return self._zipf.read(pack_uri.membername) + try: + return self._zipf.read(pack_uri.membername) + except (BadZipFile, zlib.error, EOFError, RuntimeError) as e: + raise PackageNotFoundError( + "Package member '%s' could not be read: %s" % (pack_uri.membername, e) + ) from e def close(self): """Close the zip archive, releasing any resources it is using.""" diff --git a/tests/opc/test_phys_pkg.py b/tests/opc/test_phys_pkg.py index 6de0d868b..3e8e97c55 100644 --- a/tests/opc/test_phys_pkg.py +++ b/tests/opc/test_phys_pkg.py @@ -2,7 +2,9 @@ import hashlib import io -from zipfile import ZIP_DEFLATED, ZipFile +import struct +import zlib +from zipfile import ZIP_DEFLATED, BadZipFile, ZipFile import pytest @@ -70,6 +72,50 @@ def it_raises_when_pkg_path_is_not_a_package(self): class DescribeZipPkgReader: + def it_raises_PackageNotFoundError_when_stream_is_not_a_zip(self): + with pytest.raises(PackageNotFoundError, match="not a valid zip file"): + _ZipPkgReader(io.BytesIO(b"not a zip file")) + + def it_raises_PackageNotFoundError_when_blob_has_bad_crc(self): + """BadZipFile (CRC mismatch) from ZipFile.read() is wrapped.""" + # Build a zip with a large enough payload, then flip a byte in the + # middle of the compressed data to cause a CRC mismatch at read time. + buf = io.BytesIO() + with ZipFile(buf, "w", compression=ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", b"" * 50) + raw = bytearray(buf.getvalue()) + sig_pos = raw.find(b"PK\x03\x04") + fname_len = struct.unpack_from("" * 1000) + raw = bytearray(buf.getvalue()) + sig_pos = raw.find(b"PK\x03\x04") + fname_len = struct.unpack_from("