工作中需要把PPOCRLabel标注格式转化成需要的xml格式。
遇到的问题:1、之前只用minidom读过xml,还没有用minidom写过xml。2、minidom写出的xml中,节点属性的顺序与代码里设置属性的顺序不一致。
minidom源码在我本地路径:D:\Anaconda3\Lib\xml\dom\minidom.py
参考:Python3.7写入xml文件 保持节点属性顺序不变
首先导入模块 from xml.dom.minidom import Document,然后按下ctrl 使用鼠标点击minidom进入源码
按 Ctrl+F 搜索 a_names = sorted(attrs.keys()),然后把这一行注释掉,
改为 a_names = attrs.keys(),即不再对属性名排序。(注:Python 3.8 及以上版本的 minidom 在写出xml时已默认保持属性的设置顺序,无需再修改源码。)
下面是我的PPOCRLabel标注格式转化成需要的xml格式的代码。
- # -*- coding : UTF-8 -*-
- # @file : conver_json_icdar.py
- # @Time : 2021/4/9 11:24
- # @Author : wmz
-
- import os
- import json
- import xml.dom.minidom as minidom
-
-
def json_2_icdar(js_path, ic_path):
    """Convert a PPOCRLabel label file into ICDAR-style text files.

    Each non-blank line of ``js_path`` must hold an image path, a tab, and a
    JSON list of boxes; every box is a dict with a ``transcription`` string
    and a ``points`` list of ``[x, y]`` pairs. For every such line one text
    file is written below ``ic_path``. It is named after the image with the
    extension replaced by ``.txt`` and contains one
    ``x1,y1,...,xn,yn,transcription`` row per box, with coordinates
    truncated to ``int``.

    Args:
        js_path: Path of the UTF-8 label file to read.
        ic_path: Directory that receives the generated ``.txt`` files.

    Raises:
        IndexError: If a non-blank line has no tab separator.
        json.JSONDecodeError: If the annotation column is not valid JSON.
        KeyError: If a box lacks ``transcription`` or ``points``.
    """
    with open(js_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) carry no annotation.
            if not line.strip():
                continue
            # Split on the first tab only; everything after it is the JSON.
            fields = line.split('\t', 1)
            image_name, annotation = fields[0], fields[1]
            # Parse before opening the output so that a malformed line does
            # not leave an empty file behind.
            boxes = json.loads(annotation)
            # Swap only the real extension: replacing the substring 'jpg'
            # would also rewrite directory or file names containing "jpg"
            # and would leave non-JPG images with their image extension.
            txt_file = os.path.splitext(image_name)[0] + '.txt'
            dst_file = os.path.join(ic_path, txt_file)
            # The image path may include a sub-directory; make sure it exists.
            dst_dir = os.path.dirname(dst_file)
            if dst_dir:
                os.makedirs(dst_dir, exist_ok=True)
            with open(dst_file, 'w', encoding='utf-8') as out:
                for box in boxes:
                    coords = [int(v) for corner in box['points'] for v in corner]
                    row = ','.join(map(str, coords)) + ',' + box['transcription']
                    out.write(row + '\n')
-
-
def json_2_xml(js_path, xml_path):
    """Convert a PPOCRLabel label file into one ``ImageInfo`` XML file per image.

    Each non-blank line of ``js_path`` must hold an image path, a tab, and a
    JSON list of boxes; every box is a dict with a ``transcription`` string
    and a ``points`` list of ``[x, y]`` pairs. For every such line an XML
    file is written below ``xml_path``, named after the image with the
    extension replaced by ``.xml``. Its root ``ImageInfo`` element holds one
    ``LineInfo`` child per box, carrying the first four corners as
    ``ptLTX`` ... ``ptLBY`` (truncated to ``int``), the text as ``Chars``
    and a constant ``bModify="3"``.

    Attributes are set in the intended output order. minidom writes them in
    that order on Python 3.8 and later; older versions sort them by name.

    Args:
        js_path: Path of the UTF-8 label file to read.
        xml_path: Directory that receives the generated ``.xml`` files.

    Raises:
        IndexError: If a non-blank line has no tab separator, or a box has
            fewer than four corner points.
        json.JSONDecodeError: If the annotation column is not valid JSON.
        KeyError: If a box lacks ``transcription`` or ``points``.
    """
    # Attribute names for the flattened corner list, in clockwise order
    # starting at the top-left corner: x then y for each corner.
    corner_attrs = ('ptLTX', 'ptLTY', 'ptRTX', 'ptRTY',
                    'ptRBX', 'ptRBY', 'ptLBX', 'ptLBY')
    with open(js_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) carry no annotation.
            if not line.strip():
                continue
            # Split on the first tab only; everything after it is the JSON.
            fields = line.split('\t', 1)
            image_name, annotation = fields[0], fields[1]
            boxes = json.loads(annotation)
            # Swap only the real extension: replacing the substring 'jpg'
            # would also rewrite directory or file names containing "jpg"
            # and would leave non-JPG images with their image extension.
            xml_file = os.path.splitext(image_name)[0] + '.xml'
            dst_xml_file = os.path.join(xml_path, xml_file)

            # Build the DOM tree: every node is created through the document.
            dom = minidom.Document()
            root_node = dom.createElement('ImageInfo')
            dom.appendChild(root_node)
            root_node.setAttribute('bModify', '3')

            for box in boxes:
                points = [int(v) for corner in box['points'] for v in corner]
                info_node = dom.createElement('LineInfo')
                root_node.appendChild(info_node)
                # Index explicitly so that a box with too few points fails
                # loudly instead of producing a partial element.
                for idx, attr in enumerate(corner_attrs):
                    info_node.setAttribute(attr, str(points[idx]))
                info_node.setAttribute('Chars', box['transcription'])
                info_node.setAttribute('bModify', '3')

            # The image path may include a sub-directory; make sure it exists.
            dst_dir = os.path.dirname(dst_xml_file)
            if dst_dir:
                os.makedirs(dst_dir, exist_ok=True)
            with open(dst_xml_file, 'w', encoding='UTF-8') as fh:
                dom.writexml(fh, indent='', addindent='\t', newl='\n', encoding='UTF-8')
-
-
if __name__ == "__main__":
    import sys

    # Usage: python <script> [label_file] [output_dir]
    # Without arguments the author's original machine-specific locations are
    # used. Earlier runs also pointed src_path at a PPOCRLabel "Label.txt".
    src_path = r"C:\Users\WT\Desktop\hkb\Cache.cach"
    dst_path = r"C:\Users\WT\Desktop"
    if len(sys.argv) > 1:
        src_path = sys.argv[1]
    if len(sys.argv) > 2:
        dst_path = sys.argv[2]
    json_2_xml(src_path, dst_path)