2022年 11月 4日

使用python写xml

工作中需要把PPOCRLabel标注格式转化成需要的xml格式。

遇到的问题:1、之前只用过 minidom 读 xml,还没有用 minidom 写过 xml。2、minidom 写出的 xml 中,节点属性的顺序与代码中设置属性的顺序不一致。

minidom源码在我本地路径:D:\Anaconda3\Lib\xml\dom\minidom.py

参考:Python3.7写入xml文件 保持节点属性顺序不变

首先导入模块 from xml.dom.minidom import Document,然后按住 Ctrl 并用鼠标点击 minidom,进入其源码。

按 Ctrl+F 搜索 a_names = sorted(attrs.keys()),然后把这一行注释掉,

并添加 a_names = attrs.keys(),即不再对属性名排序。(注:Python 3.8 及以上版本的 minidom 已默认保持属性的设置顺序,无需再修改源码。)

 

下面是我把 PPOCRLabel 标注格式转换成所需 xml 格式的代码。

  1. # -*- coding : UTF-8 -*-
  2. # @file : conver_json_icdar.py
  3. # @Time : 2021/4/9 11:24
  4. # @Author : wmz
  5. import os
  6. import json
  7. import xml.dom.minidom as minidom
  8. def json_2_icdar(js_path, ic_path):
  9. with open(js_path, 'r', encoding='utf-8') as f:
  10. for line in f.readlines():
  11. print(line)
  12. content = line.split('\t')
  13. print(content[0])
  14. txt_file = str(content[0]).replace('jpg', 'txt')
  15. dst_file = os.path.join(ic_path, txt_file)
  16. # write file
  17. file_lineinfo = open(dst_file, 'w', encoding='utf-8')
  18. list_dict = json.loads(content[1])
  19. nsize = len(list_dict)
  20. print(nsize)
  21. for i in range(nsize):
  22. print(list_dict[i])
  23. lin = list_dict[i]
  24. info = lin['transcription']
  25. points = lin['points']
  26. points = [int(y) for x in points for y in x]
  27. pts = ','.join(map(str, points))
  28. lineinfo = pts + ',' + info + '\n'
  29. file_lineinfo.write(lineinfo)
  30. file_lineinfo.close()
  31. def json_2_xml(js_path, xml_path):
  32. with open(js_path, 'r', encoding='utf-8') as f:
  33. for line in f.readlines():
  34. print(line)
  35. content = line.split('\t')
  36. print(content[0])
  37. xml_file = str(content[0]).replace('jpg', 'xml')
  38. dst_xml_file = os.path.join(xml_path, xml_file)
  39. # txt_file = str(content[0]).replace('jpg', 'txt')
  40. # dst_file = os.path.join(xml_path, txt_file)
  41. # write file
  42. # 1.创建DOM树对象
  43. dom = minidom.Document()
  44. # 2.创建根节点。每次都要用DOM对象来创建任何节点。
  45. root_node = dom.createElement('ImageInfo')
  46. # 3.用DOM对象添加根节点
  47. dom.appendChild(root_node)
  48. # 设置该节点的属性
  49. root_node.setAttribute('bModify', '3')
  50. # file_lineinfo = open(dst_file, 'w', encoding='utf-8')
  51. list_dict = json.loads(content[1])
  52. nsize = len(list_dict)
  53. print(nsize)
  54. for i in range(nsize):
  55. print(list_dict[i])
  56. lin = list_dict[i]
  57. info = lin['transcription']
  58. points = lin['points']
  59. points = [int(y) for x in points for y in x]
  60. # 用DOM对象创建元素子节点
  61. info_node = dom.createElement('LineInfo')
  62. # 用父节点对象添加元素子节点
  63. root_node.appendChild(info_node)
  64. # 设置该节点的属性
  65. info_node.setAttribute('ptLTX', str(points[0]))
  66. info_node.setAttribute('ptLTY', str(points[1]))
  67. info_node.setAttribute('ptRTX', str(points[2]))
  68. info_node.setAttribute('ptRTY', str(points[3]))
  69. info_node.setAttribute('ptRBX', str(points[4]))
  70. info_node.setAttribute('ptRBY', str(points[5]))
  71. info_node.setAttribute('ptLBX', str(points[6]))
  72. info_node.setAttribute('ptLBY', str(points[7]))
  73. info_node.setAttribute('Chars', info)
  74. info_node.setAttribute('bModify', '3')
  75. pts = ','.join(map(str, points))
  76. # lineinfo = pts + ',' + info + '\n'
  77. # file_lineinfo.write(lineinfo)
  78. # file_lineinfo.close()
  79. with open(dst_xml_file, 'w', encoding='UTF-8') as fh:
  80. dom.writexml(fh, indent='', addindent='\t', newl='\n', encoding='UTF-8')
  81. if __name__ == "__main__":
  82. # src_path = r"C:\Users\WT\Desktop\hkb-bz\Label.txt"
  83. src_path = r"C:\Users\WT\Desktop\hkb\Cache.cach"
  84. dst_path = r"C:\Users\WT\Desktop"
  85. # src_path = r"C:\Users\WT\Desktop\户口本\Cache.cach"
  86. # dst_path = r"C:\Users\WT\Desktop"
  87. json_2_xml(src_path, dst_path)