【无标题】poi-ooxml解析word

原创已于 2025-04-25 11:35:47 修改

· 173 阅读

0 ·

版权

文章标签：

#word #python #开发语言 #java #ruoyi

于 2025-04-25 11:34:11 首次发布

这里写自定义目录标题

自己瞎整的，供参考
代码

自己瞎整的，供参考

网上的文档太少了，只能自己一边琢磨、一边借助 AI 摸索，最后整理出了一个还算比较满意的版本，尤其是对 Word 中表格的处理这块。

代码

直接贴代码

pom.xml（注意：下面的 Java 代码实际 import 的是 docx4j 的 API，同时还用到了 hutool 和 fastjson2，需要一并引入对应依赖）

<dependency>
	<groupId>org.apache.poi</groupId>
	<artifactId>poi-ooxml</artifactId>
	<version>4.1.2</version>
</dependency>

package com.ruoyi.project.project.util;

import cn.hutool.core.map.MapUtil;
import cn.hutool.core.util.RandomUtil;
import com.alibaba.fastjson2.JSONArray;
import com.ruoyi.framework.config.RuoYiConfig;
import com.ruoyi.framework.config.ServerConfig;
import org.docx4j.dml.wordprocessingDrawing.Inline;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.Part;
import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPart;
import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;
import org.docx4j.openpackaging.parts.relationships.RelationshipsPart;
import org.docx4j.wml.*;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import javax.annotation.PostConstruct;
import javax.xml.bind.JAXBElement;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Converts the content of a .docx document (loaded through docx4j) into a tree of plain
 * {@code Map<String, Object>} nodes.
 *
 * <p>Every node carries a {@code "type"} entry ({@code P}, {@code R}, {@code TBL}, {@code tc},
 * {@code TD}, {@code IMG}, {@code TEXT}, {@code ARRAY}) and a {@code "content"} entry. Table
 * cells may additionally carry {@code "hMerge"} (horizontal span, taken from {@code gridSpan})
 * and {@code "vMerge"} (vertical merge state, later turned into a row span).
 *
 * <p>Typical pipeline: {@link #parseObject} -> {@link #formatDocumentObject} ->
 * {@link #formatDocumentTbl}.
 */
@Component
public class DocUtil {

    // Chinese numerals accepted inside a heading ordinal.
    private static final String CHINESE_NUMBERS = "一二三四五六七八九十";
    // Heading prefixes: "第<numerals>章" (level-1 heading) and "第<numerals>节" (level-2 heading).
    private static final String PATTERN_STRING = "^第[" + CHINESE_NUMBERS + "]+章";
    private static final String PATTERN_STRING1 = "^第[" + CHINESE_NUMBERS + "]+节";
    // Compiled once instead of on every call.
    private static final Pattern CHAPTER_PATTERN = Pattern.compile(PATTERN_STRING);
    private static final Pattern SECTION_PATTERN = Pattern.compile(PATTERN_STRING1);

    // Keys and marker values used in the node maps.
    private static final String KEY_TYPE = "type";
    private static final String KEY_CONTENT = "content";
    private static final String KEY_H_MERGE = "hMerge";
    private static final String KEY_V_MERGE = "vMerge";
    private static final String V_MERGE_RESTART = "restart";
    private static final String V_MERGE_CONTINUE = "continue";

    private static final double EMU_PER_INCH = 914400.0;
    private static final int SCREEN_DPI = 96;

    // Static handle filled from the injected bean in init(), so static code can reach it.
    static ServerConfig serverConfig;
    @Autowired
    ServerConfig tempServerConfig;

    /**
     * Tells whether the text starts with a level-1 heading prefix such as "第一章".
     *
     * @param input text to test; {@code null} yields {@code false}
     * @return {@code true} when the text starts with "第" + Chinese numerals + "章"
     */
    public static boolean isPara(String input) {
        return input != null && CHAPTER_PATTERN.matcher(input).find();
    }

    /**
     * Tells whether the text starts with a level-2 heading prefix such as "第一节".
     *
     * @param input text to test; {@code null} yields {@code false}
     * @return {@code true} when the text starts with "第" + Chinese numerals + "节"
     */
    public static boolean isPara1(String input) {
        return input != null && SECTION_PATTERN.matcher(input).find();
    }

    /**
     * Demo entry point: loads a .docx file, runs the full pipeline and prints every resulting node.
     *
     * @param args optional; {@code args[0]} is the path of the document to parse, otherwise the
     *             built-in sample path is used
     */
    public static void main(String[] args) throws Docx4JException, IOException {
        String path = (args != null && args.length > 0) ? args[0] : "/Users/xxxx/Desktop/workspace/表格测试.docx";
        // Load the docx file.
        WordprocessingMLPackage wordprocessingMLPackage = WordprocessingMLPackage.load(new File(path));
        try {
            // The main document part holds the body content; its relationships resolve images.
            MainDocumentPart part = wordprocessingMLPackage.getMainDocumentPart();
            List<Object> list = part.getContent();
            List<Map<String, Object>> maps = parseObject(list, part.getRelationshipsPart());
            List<Map<String, Object>> targetList = DocUtil.formatDocumentObject(maps, null);
            targetList = DocUtil.formatDocumentTbl(targetList);
            System.out.println("最后的结果：---------------------");
            for (Map<String, Object> stringObjectMap : targetList) {
                System.out.println(stringObjectMap);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Post-processes every {@code TBL} node of an already formatted document: turns the
     * {@code vMerge} markers into row spans, drops the placeholder cells of merged regions and
     * finally replaces the table's {@code content} with its JSON string.
     *
     * <p>The given maps are modified in place. Tables whose content is no longer a list
     * (for example because this method already ran on them) are left untouched.
     *
     * @param maps formatted nodes as returned by {@link #formatDocumentObject}
     * @return the same list instance
     */
    public static List<Map<String, Object>> formatDocumentTbl(List<Map<String, Object>> maps) {
        for (Map<String, Object> map : maps) {
            // Only tables that still hold a row list are processed.
            if (!"TBL".equals(map.get(KEY_TYPE)) || !(map.get(KEY_CONTENT) instanceof List)) {
                continue;
            }
            List<Map<String, Object>> rows = children(map);

            // Pass 1: resolve vertical merges. Placeholder cells stay in the rows during this
            // pass so that column offsets of earlier rows remain computable.
            for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
                List<Map<String, Object>> cells = children(rows.get(rowIndex));
                if (cells == null) {
                    continue;
                }
                int column = 0;
                for (Map<String, Object> cell : cells) {
                    Object vMerge = cell.get(KEY_V_MERGE);
                    if (V_MERGE_RESTART.equals(vMerge)) {
                        // First cell of a vertical merge: start with a row span of 1.
                        cell.put(KEY_V_MERGE, 1);
                    } else if (V_MERGE_CONTINUE.equals(vMerge)) {
                        // Continuation cell: grow the span of the cell that started the merge.
                        searchAndChangePreviousData(rowIndex, column, rows);
                        cell.remove(KEY_V_MERGE);
                    }
                    column += columnSpan(cell);
                }
            }

            // Pass 2: remove cells that carry nothing but merge bookkeeping.
            for (Map<String, Object> row : rows) {
                List<Map<String, Object>> cells = children(row);
                if (cells == null) {
                    continue;
                }
                for (int i = cells.size() - 1; i >= 0; i--) {
                    Map<String, Object> cell = cells.get(i);
                    if (cell.isEmpty() || (cell.size() == 1 && cell.containsKey(KEY_H_MERGE))) {
                        cells.remove(i);
                    }
                }
            }
            map.put(KEY_CONTENT, JSONArray.toJSONString(rows));
        }
        return maps;
    }

    /**
     * Walks upwards from the row above {@code rowIndex} and increments the row span of the first
     * cell that starts at grid column {@code index} and already holds a numeric span.
     */
    private static void searchAndChangePreviousData(int rowIndex, int index, List<Map<String, Object>> tblContent) {
        for (int i = rowIndex - 1; i >= 0; i--) {
            List<Map<String, Object>> cells = children(tblContent.get(i));
            if (cells == null) {
                continue;
            }
            int column = 0;
            for (Map<String, Object> cell : cells) {
                if (column == index) {
                    Object vMerge = cell.get(KEY_V_MERGE);
                    // Only a cell whose marker was already converted to a number owns a span.
                    if (vMerge instanceof Integer) {
                        cell.put(KEY_V_MERGE, (Integer) vMerge + 1);
                        return;
                    }
                }
                column += columnSpan(cell);
            }
        }
    }

    /** Number of grid columns a cell occupies: its numeric {@code hMerge}, otherwise 1. */
    private static int columnSpan(Map<String, Object> cell) {
        Object span = cell.get(KEY_H_MERGE);
        return span instanceof Number ? ((Number) span).intValue() : 1;
    }

    /**
     * Recursively converts docx4j content objects into raw nodes.
     *
     * <p>Handled objects (after unwrapping a {@code JAXBElement}): {@code P}, {@code R},
     * {@code Tbl}, {@code Tc}, {@code Drawing} (first inline picture only) and {@code Text}.
     * Everything else is ignored. Empty text and text starting with {@code HYPERLINK} is skipped.
     *
     * @param list docx4j content list; {@code null} is treated as empty
     * @param part relationships part used to resolve embedded images; may be {@code null}, in
     *             which case images are reported without being saved
     * @return raw nodes in document order
     */
    public static List<Map<String, Object>> parseObject(List<Object> list, RelationshipsPart part) {
        List<Map<String, Object>> resultList = new ArrayList<>();
        if (list == null) {
            return resultList;
        }
        for (Object raw : list) {
            Object value = raw instanceof JAXBElement ? ((JAXBElement<?>) raw).getValue() : raw;
            if (value instanceof P) {
                resultList.add(node("P", parseObject(((P) value).getContent(), part)));
            } else if (value instanceof R) {
                resultList.add(node("R", parseObject(((R) value).getContent(), part)));
            } else if (value instanceof Tbl) {
                resultList.add(node("TBL", parseRows((Tbl) value, part)));
            } else if (value instanceof Tc) {
                resultList.add(parseCell((Tc) value, part));
            } else if (value instanceof Drawing) {
                Map<String, Object> image = parseImage((Drawing) value, part);
                if (image != null) {
                    resultList.add(image);
                }
            } else if (value instanceof Text) {
                String text = ((Text) value).getValue();
                if (text != null && !text.isEmpty() && !text.startsWith("HYPERLINK")) {
                    resultList.add(node("TEXT", text));
                }
            }
        }
        return resultList;
    }

    /** Parses every {@code Tr} of a table into a list of raw cell nodes (one list per row). */
    private static List<Object> parseRows(Tbl table, RelationshipsPart part) {
        List<Object> tableList = new ArrayList<>();
        for (Object obj : table.getContent()) {
            if (obj instanceof Tr) {
                tableList.add(parseObject(((Tr) obj).getContent(), part));
            }
        }
        return tableList;
    }

    /** Builds a raw {@code tc} node and copies the merge information when the cell declares any. */
    private static Map<String, Object> parseCell(Tc tc, RelationshipsPart part) {
        Map<String, Object> cell = node("tc", parseObject(tc.getContent(), part));
        TcPr tcPr = tc.getTcPr();
        // Cell properties are optional; a plain cell simply has no merge information.
        if (tcPr == null) {
            return cell;
        }
        if (tcPr.getGridSpan() != null) {
            cell.put(KEY_H_MERGE, tcPr.getGridSpan().getVal());
        }
        if (tcPr.getVMerge() != null) {
            // A vMerge element without a value is stored as a continuation.
            Object val = tcPr.getVMerge().getVal();
            cell.put(KEY_V_MERGE, val == null ? V_MERGE_CONTINUE : val);
        }
        return cell;
    }

    /**
     * Builds an {@code IMG} node for the first element of a drawing when it is an inline picture.
     *
     * @return the node, or {@code null} when the drawing has no inline picture (e.g. it is
     *         anchored, or it holds a chart or another non-picture graphic)
     */
    private static Map<String, Object> parseImage(Drawing drawing, RelationshipsPart part) {
        List<Object> items = drawing.getAnchorOrInline();
        if (items == null || items.isEmpty() || !(items.get(0) instanceof Inline)) {
            return null;
        }
        Inline inline = (Inline) items.get(0);
        if (!hasEmbeddedPicture(inline)) {
            return null;
        }
        String relId = inline.getGraphic().getGraphicData().getPic().getBlipFill().getBlip().getEmbed();

        // Picture size in pixels; stays 0 when the picture declares no extent.
        int widthPx = 0;
        int heightPx = 0;
        if (hasExtent(inline)) {
            long cx = inline.getGraphic().getGraphicData().getPic().getSpPr().getXfrm().getExt().getCx();
            long cy = inline.getGraphic().getGraphicData().getPic().getSpPr().getXfrm().getExt().getCy();
            widthPx = emuToPixels(cx, SCREEN_DPI);
            heightPx = emuToPixels(cy, SCREEN_DPI);
        }

        // Resolve the image part behind the relationship id and store its bytes as a file.
        Part imagePart = (part == null || relId == null) ? null : part.getPart(relId);
        String fileurl = "1"; // placeholder kept when the image bytes cannot be resolved
        String filename = "image_" + System.currentTimeMillis() + RandomUtil.randomNumbers(6) + ".png";
        if (imagePart instanceof BinaryPart) {
            fileurl = saveFile(((BinaryPart) imagePart).getBytes(), filename);
        }

        Map<String, Object> image = new HashMap<>();
        image.put(KEY_TYPE, "IMG");
        image.put("width", widthPx);
        image.put("height", heightPx);
        image.put(KEY_CONTENT, fileurl);
        return image;
    }

    /** True when the inline graphic contains a picture with a blip (image reference). */
    private static boolean hasEmbeddedPicture(Inline inline) {
        return inline.getGraphic() != null
                && inline.getGraphic().getGraphicData() != null
                && inline.getGraphic().getGraphicData().getPic() != null
                && inline.getGraphic().getGraphicData().getPic().getBlipFill() != null
                && inline.getGraphic().getGraphicData().getPic().getBlipFill().getBlip() != null;
    }

    /** True when the picture declares an extent; requires {@link #hasEmbeddedPicture} to hold. */
    private static boolean hasExtent(Inline inline) {
        return inline.getGraphic().getGraphicData().getPic().getSpPr() != null
                && inline.getGraphic().getGraphicData().getPic().getSpPr().getXfrm() != null
                && inline.getGraphic().getGraphicData().getPic().getSpPr().getXfrm().getExt() != null;
    }

    // EMU to inches.
    private static double emuToInches(long emu) {
        return emu / EMU_PER_INCH;
    }

    // EMU to pixels for the given DPI; the fractional part is truncated.
    private static int emuToPixels(long emu, int dpi) {
        return (int) (emuToInches(emu) * dpi);
    }

    /**
     * Writes the image bytes to {@code <filePath>/newFolder/<filename>} and returns the path
     * under which the file is reported. The base directory is a placeholder; adapt it or replace
     * this method with the project's own file storage.
     *
     * @throws RuntimeException wrapping the {@link IOException} when the file cannot be written
     */
    private static String saveFile(byte[] imageData, String filename) {
        // Local base directory for stored files (placeholder, adjust to your environment).
        String filePath = "xxxxxxxxxxx";
        File dir = new File(filePath + "/newFolder/");
        if (!dir.exists()) {
            dir.mkdirs();
        }
        // try-with-resources guarantees the stream is closed even when the write fails.
        try (FileOutputStream fos = new FileOutputStream(filePath + "/newFolder/" + filename)) {
            fos.write(imageData);
            System.out.println("Image saved successfully.");
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers both cases.
            System.out.println("Image saved error.");
            throw new RuntimeException(e);
        }
        return "/profile/upload/newFolder/" + filename;
    }

    /**
     * Flattens raw nodes produced by {@link #parseObject} into the presentation structure:
     * runs are dissolved into their children, paragraphs are collapsed into a single
     * {@code TEXT} or {@code ARRAY} node, cells are reduced to one node each and tables are
     * rebuilt as lists of {@code TD} rows. Nodes of any other type are passed through unchanged.
     *
     * @param maps      raw nodes; {@code null} is treated as empty
     * @param parentMap node that contains {@code maps}, or {@code null} at the top level
     * @return formatted nodes
     */
    @SuppressWarnings("unchecked")
    public static List<Map<String, Object>> formatDocumentObject(List<Map<String, Object>> maps, Map<String, Object> parentMap) {
        List<Map<String, Object>> resultList = new ArrayList<>();
        if (maps == null) {
            return resultList;
        }
        for (Map<String, Object> map : maps) {
            Object type = map.get(KEY_TYPE);
            if ("R".equals(type)) {
                resultList.addAll(formatDocumentObject(children(map), map));
            } else if ("tc".equals(type)) {
                resultList.addAll(formatCell(map));
            } else if ("P".equals(type)) {
                resultList.addAll(formatParagraph(map, parentMap));
            } else if ("TBL".equals(type)) {
                resultList.add(node("TBL", formatDocumentObjectList((List<Object>) map.get(KEY_CONTENT))));
            } else {
                resultList.add(map);
            }
        }
        return resultList;
    }

    /**
     * Formats one raw {@code tc} node. Adjacent text items are joined with line breaks, merge
     * information is copied onto the resulting items, and several items are wrapped into one
     * {@code ARRAY} node so that a cell normally yields a single node.
     */
    private static List<Map<String, Object>> formatCell(Map<String, Object> cell) {
        List<Map<String, Object>> content = joinAdjacentText(formatDocumentObject(children(cell), cell));

        if (cell.containsKey(KEY_H_MERGE) || cell.containsKey(KEY_V_MERGE)) {
            if (content.isEmpty() && V_MERGE_CONTINUE.equals(cell.get(KEY_V_MERGE))) {
                // Empty continuation cell: keep a placeholder that only carries merge information
                // so that column positions can still be computed later.
                Map<String, Object> placeholder = new HashMap<>();
                copyMergeInfo(cell, placeholder);
                content.add(placeholder);
            } else {
                for (Map<String, Object> item : content) {
                    copyMergeInfo(cell, item);
                }
            }
        }
        if (content.size() > 1) {
            Map<String, Object> wrapper = node("ARRAY", content);
            // The wrapper is what represents the cell inside its row, so it must carry the merge
            // information as well; otherwise column offsets and row spans are computed wrongly.
            copyMergeInfo(cell, wrapper);
            content = new ArrayList<>();
            content.add(wrapper);
        }
        return content;
    }

    /** Copies {@code hMerge} / {@code vMerge} from {@code source} to {@code target} when present. */
    private static void copyMergeInfo(Map<String, Object> source, Map<String, Object> target) {
        if (source.containsKey(KEY_H_MERGE)) {
            target.put(KEY_H_MERGE, source.get(KEY_H_MERGE));
        }
        if (source.containsKey(KEY_V_MERGE)) {
            target.put(KEY_V_MERGE, source.get(KEY_V_MERGE));
        }
    }

    /** Replaces each run of consecutive {@code TEXT} items by one item joined with line breaks. */
    private static List<Map<String, Object>> joinAdjacentText(List<Map<String, Object>> items) {
        List<Map<String, Object>> joined = new ArrayList<>();
        StringBuilder pending = new StringBuilder();
        for (Map<String, Object> item : items) {
            if ("TEXT".equals(item.get(KEY_TYPE))) {
                pending.append(item.get(KEY_CONTENT)).append("\n");
            } else {
                flushText(pending, joined);
                joined.add(item);
            }
        }
        flushText(pending, joined);
        return joined;
    }

    /** Emits the buffered text (without its trailing line break) as a node and clears the buffer. */
    private static void flushText(StringBuilder pending, List<Map<String, Object>> target) {
        if (pending.length() > 0) {
            target.add(node("TEXT", pending.substring(0, pending.length() - 1)));
            pending.setLength(0);
        }
    }

    /**
     * Formats one raw {@code P} node: an empty paragraph becomes an empty {@code TEXT} node
     * (except inside a vertically continued cell, where it yields nothing), a single child is
     * passed through, several text children are concatenated and mixed children are wrapped
     * into an {@code ARRAY} node.
     */
    private static List<Map<String, Object>> formatParagraph(Map<String, Object> paragraph, Map<String, Object> parentMap) {
        List<Map<String, Object>> content = formatDocumentObject(children(paragraph), paragraph);
        List<Map<String, Object>> result = new ArrayList<>();
        if (content.isEmpty()) {
            boolean insideContinuedCell = parentMap != null
                    && "tc".equals(parentMap.get(KEY_TYPE))
                    && V_MERGE_CONTINUE.equals(parentMap.get(KEY_V_MERGE));
            // A continued cell must stay empty so that it can be recognised as a placeholder.
            if (!insideContinuedCell) {
                result.add(node("TEXT", ""));
            }
        } else if (content.size() == 1) {
            result.addAll(content);
        } else {
            boolean isAllText = true;
            StringBuilder sb = new StringBuilder();
            for (Map<String, Object> item : content) {
                if (item.containsKey(KEY_TYPE) && !"TEXT".equals(item.get(KEY_TYPE))) {
                    isAllText = false;
                }
                sb.append(item.get(KEY_CONTENT));
            }
            result.add(isAllText ? node("TEXT", sb.toString()) : node("ARRAY", content));
        }
        return result;
    }

    /**
     * Formats the row lists of a table. Every non-empty list of cell maps becomes one
     * {@code TD} node; nested lists are flattened; maps are passed through. Empty lists are
     * skipped.
     */
    @SuppressWarnings("unchecked")
    private static List<Map<String, Object>> formatDocumentObjectList(List<Object> content) {
        List<Map<String, Object>> resultList = new ArrayList<>();
        if (content == null) {
            return resultList;
        }
        for (Object o : content) {
            if (o instanceof List) {
                List<?> oList = (List<?>) o;
                if (oList.isEmpty()) {
                    continue;
                }
                Object first = oList.get(0);
                if (first instanceof List) {
                    resultList.addAll(formatDocumentObjectList((List<Object>) o));
                } else if (first instanceof Map) {
                    Map<String, Object> newMap = new HashMap<>();
                    // The type must be set before formatting: the row acts as parent of its cells.
                    newMap.put(KEY_TYPE, "TD");
                    newMap.put(KEY_CONTENT, formatDocumentObject((List<Map<String, Object>>) o, newMap));
                    resultList.add(newMap);
                }
            } else if (o instanceof Map) {
                resultList.add((Map<String, Object>) o);
            }
        }
        return resultList;
    }

    /** Creates a mutable node with the given type and content. */
    private static Map<String, Object> node(String type, Object content) {
        Map<String, Object> node = new HashMap<>();
        node.put(KEY_TYPE, type);
        node.put(KEY_CONTENT, content);
        return node;
    }

    /** Returns the {@code content} entry of a node as a node list, or {@code null} when it is not a list. */
    @SuppressWarnings("unchecked")
    private static List<Map<String, Object>> children(Map<String, Object> node) {
        Object content = node.get(KEY_CONTENT);
        return content instanceof List ? (List<Map<String, Object>>) content : null;
    }

    /** Publishes the injected {@code ServerConfig} through the static field after construction. */
    @PostConstruct
    public void init() {
        serverConfig = this.tempServerConfig;
    }
}