How to extract pictures from a Word document with Python

Method 1: extract the images by decompressing the .docx file

Preface

A .docx file is actually a zip archive. When we unzip a .docx file directly, we can see the following directory structure

The picture we are looking for is in the word/media directory, as shown in the figure

Therefore, to extract the images from a Word document, you unzip the .docx file, copy the images out of the word/media directory, and then delete the unzipped files

Code

import os
import shutil
import zipfile
 
 
def get_pictures(word_path, result_path):
    """
    Get all pictures embedded in a word (.docx) file.

    A .docx file is a zip archive whose pictures are stored under
    ``word/media/``.  The pictures are read straight from the archive, so
    no temporary copy or extraction directory is created next to the source
    file.  Each picture is saved as ``<word file name>_<picture name>``.

    :param word_path: word file
    :param result_path: Results directory, no manual creation required
    :return: the string 'no pictures found' when the archive has no
        word/media entries, otherwise a generator yielding the path of each
        picture written to result_path
    """
    # Note: the word pictures are in the word/media directory in the zip file.
    # Zip member names always use '/' regardless of the platform.
    media_prefix = 'word/media/'
    with zipfile.ZipFile(word_path, 'r') as archive:
        media_entries = [name for name in archive.namelist()
                         if name.startswith(media_prefix)]
        if not media_entries:
            return 'no pictures found'
        os.makedirs(result_path, exist_ok=True)
        # Prefix every picture with the word file's base name; basename()
        # copes with whichever path separator the current platform uses.
        word_name = os.path.splitext(os.path.basename(word_path))[0]
        saved_paths = []
        for entry in media_entries:
            if entry.endswith('/'):
                # Explicit directory entry, not a picture.
                continue
            picture = entry.rsplit('/', 1)[-1]
            target = os.path.join(result_path, f'{word_name}_{picture}')
            with open(target, 'wb') as out:
                out.write(archive.read(entry))
            saved_paths.append(target)
    # Only report the pictures written by this call, not files that were
    # already present in result_path.
    return (path for path in saved_paths)

Method 2: use the third-party python-docx library to extract pictures (recommended)

import docx
import os


def get_pictures(word_path, result_path):
    """
    Extract pictures from a word document using the docx library.

    Every relationship of the main document part whose target reference
    contains "image" is written to result_path as
    ``<word file name>_<image file name>``.

    :param word_path: word file
    :param result_path: Results directory; created when the first picture
        is found
    :return: None
    """
    doc = docx.Document(word_path)
    # Prefix every picture with the word file's base name; basename() copes
    # with whichever path separator the current platform uses.
    word_name = os.path.splitext(os.path.basename(word_path))[0]
    for rel in doc.part.rels.values():
        # Externally linked images have no embedded part to read from.
        if rel.is_external or "image" not in rel.target_ref:
            continue
        os.makedirs(result_path, exist_ok=True)
        # target_ref looks like "media/image1.png"; keep only the file name.
        img_name = rel.target_ref.rsplit("/", 1)[-1]
        img_path = os.path.join(result_path, f'{word_name}_{img_name}')
        with open(img_path, "wb") as f:
            f.write(rel.target_part.blob)

For more RPA knowledge, please visit yisaiqi community: https://support.i-search.com.cn

Posted on Tue, 21 Apr 2020 07:30:15 -0700 by sycoj0ker