parse.js 20 KB


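  // 1. Read the files of the given directory and load their contents.
  // 2. Parse the contents to obtain each image's annotation records.
  // 3. Compute statistics over the annotation records.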
  4. const fs = require('fs');
  5. const path = require('path');
  6. const readline = require('readline');
  7. const xml2js = require('xml2js');
  /**
   * Load a text file.
   *
   * @param {string} filePath - Location of the file to read.
   * @returns {Promise<string>} Resolves with the file contents decoded as UTF-8;
   *   rejects with the error reported by the fs layer.
   */
  function readFile(filePath) {
    return fs.promises.readFile(filePath, 'utf-8');
  }
  /**
   * List the entries of a directory.
   *
   * @param {string} filePath - Directory to list.
   * @returns {Promise<string[]>} Resolves with the entry names (not full paths);
   *   rejects with the error reported by the fs layer.
   */
  function readDir(filePath) {
    return fs.promises.readdir(filePath);
  }
  /**
   * Parse the text of a YOLO label file into annotation records.
   *
   * Each non-blank line has the form `type x y width height`, e.g.
   * `1 0.391297 0.095892 0.280578 0.179688`.
   *
   * Lines may end in `\n`, `\r\n` or `\r`, and fields may be separated by any run of
   * whitespace, so files rewritten with Windows line endings no longer leave a stray
   * `\r` in the last field.
   *
   * @param {string} fileData - Raw contents of the label file.
   * @returns {{type: string, x: string, y: string, width: string, height: string}[]}
   *   One record per non-blank line; every field is kept as the string read from the file.
   */
  function parseYolo(fileData) {
    const annotations = [];
    for (const rawLine of fileData.split(/\r\n|\r|\n/)) {
      const line = rawLine.trim();
      // Blank lines (including the one after a trailing newline) carry no annotation.
      if (line === '') continue;
      const [type, x, y, width, height] = line.split(/\s+/);
      annotations.push({type, x, y, width, height});
    }
    return annotations;
  }
  /**
   * Write `fileData` to `filePath`, creating the parent directory first when it is missing.
   *
   * NOTE(review): `writeFile` is declared a second time further down in this file with the
   * same body; one of the two declarations should be removed.
   *
   * @param {string} filePath - Destination file.
   * @param {string|Buffer} fileData - Content to write.
   * @returns {Promise<void>} Settles once the write has finished; rejects with the fs error.
   */
  async function writeFile(filePath, fileData) {
    const parentDir = path.dirname(filePath);
    if (!fs.existsSync(parentDir)) {
      fs.mkdirSync(parentDir, {recursive: true});
    }
    return fs.promises.writeFile(filePath, fileData);
  }
  /**
   * Move (rename) a file, creating the destination's parent directory when needed.
   *
   * The directory that has to exist before the rename is the parent of `targetPath`,
   * not the parent of the source file, so that is the one created here.
   *
   * @param {string} filePath - Existing file to move.
   * @param {string} targetPath - New location of the file.
   * @returns {Promise<void>} Settles once the rename has finished; rejects with the fs
   *   error (for example when `filePath` does not exist).
   */
  async function mvFile(filePath, targetPath) {
    const targetDir = path.dirname(targetPath);
    // With `recursive: true` this is a no-op when the directory already exists.
    fs.mkdirSync(targetDir, {recursive: true});
    return fs.promises.rename(filePath, targetPath);
  }
  /**
   * Write a file, making sure its parent directory exists beforehand.
   *
   * NOTE(review): an earlier declaration of `writeFile` with the same body exists above in
   * this file; one of the two declarations should be removed.
   *
   * @param {string} filePath - Destination file.
   * @param {string|Buffer} fileData - Content to write.
   * @returns {Promise<void>} Settles once the write has finished; rejects with the fs error.
   */
  async function writeFile(filePath, fileData) {
    const folder = path.dirname(filePath);
    const folderExists = fs.existsSync(folder);
    if (!folderExists) {
      fs.mkdirSync(folder, {recursive: true});
    }
    const outcome = await fs.promises.writeFile(filePath, fileData);
    return outcome;
  }
  /**
   * Copy a file, creating the destination's parent directory when it is missing.
   *
   * Two console lines are printed whenever the directory has to be created.
   *
   * @param {string} filePath - File to copy.
   * @param {string} targetPath - Where the copy is written.
   * @returns {Promise<void>} Settles once the copy has finished; rejects with the fs error.
   */
  async function cpFile(filePath, targetPath) {
    const targetDir = path.dirname(targetPath);
    const targetDirMissing = !fs.existsSync(targetDir);
    if (targetDirMissing) {
      console.log(`路径${targetDir} 不存在`)
      fs.mkdirSync(targetDir, {recursive: true});
      console.log('路径不存在')
    }
    return fs.promises.copyFile(filePath, targetPath);
  }
  /**
   * Delete a directory together with everything inside it.
   *
   * `fs.rmdir(path, {recursive: true})` is deprecated in Node.js; `fs.rm` is its
   * replacement. The old call is kept only as a fallback for runtimes without `fs.rm`.
   *
   * @param {string} path - Directory to remove. (The parameter shadows the `path` module
   *   inside this function, which is why the module is not used here.)
   * @returns {Promise<void>} Settles once the directory is gone; rejects with the fs error.
   */
  function rmDir(path) {
    const remove = typeof fs.rm === 'function' ? fs.rm : fs.rmdir;
    return new Promise((resolve, reject) => {
      remove(path, {recursive: true}, (err) => {
        if (err) return reject(err);
        resolve();
      });
    });
  }
  // Lookup table from the class id stored in a label file to the annotation type name
  // used for statistics and for naming the output folders.
  const dimensionType = {
    '-1': '无标注',
    0: '吊车',
    1: '塔吊',
    2: '烟火',
    3: '施工机械',
    4: '导线异物',
    5: '烟雾',
  };
  /**
   * Classify every label file by the combination of annotation types it contains.
   *
   * For each file in the labels directory the function:
   *   1. parses it with `parseYolo` and looks for the image with the same base name;
   *   2. copies images whose label file is empty into the "empty" directory;
   *   3. otherwise copies the label and the image into
   *      `<transfer>/<typeA-typeB-...>/labels|images`, where the folder name is the sorted,
   *      `-`-joined list of type names found in the file;
   *   4. counts how many files fall into each combination.
   * A summary (`result.txt`) and a detailed log (`log.txt`) are written to the logs directory.
   *
   * All directories are hard-coded below. The transfer directory is deleted at the start
   * of every run.
   *
   * @returns {Promise<void>} Rejects when any read, copy or write fails.
   */
  async function main() {
    // Label sets used on earlier runs:
    //   E:\图库\ai\epower_v2\newLabels
    //   E:\图库\ai\epower_v2\处理labels
    //   E:\图库\ai\epower_v2\labels
    const labelsPath = `E:\\图库\\ai\\epower\\all_tmpLabels_2`;
    const imagesPath = `E:\\图库\\ai\\epower\\images`;
    const emptyPath = `E:\\图库\\ai\\epower\\empty`;
    const imageExts = ['jpg'];
    const logsPath = `E:\\图库\\ai\\epower\\logs`;
    // Directory that receives the classified copies.
    const transferPath = `E:\\图库\\ai\\epower\\parse`;

    // Start from a clean transfer directory.
    if (fs.existsSync(transferPath)) {
      await rmDir(transferPath);
    }

    const countMap = {};
    const logStrArr = [];
    const resultStrArr = [];

    const labelFiles = await readDir(labelsPath);
    const imagesFiles = await readDir(imagesPath);

    const now = new Date();
    const nowStr = `${now.getFullYear()}-${now.getMonth() + 1}-${now.getDate()} ${now.getHours()}点${now.getMinutes()}分${now.getSeconds()}秒`;
    resultStrArr.push(`[I] [START] 开始统计分析数据标注信息 ${nowStr}`);
    resultStrArr.push(`[I] 原标注信息路径: ${labelsPath}`);
    // These two lines used to print labelsPath instead of the path they describe.
    resultStrArr.push(`[I] 原图片文件路径: ${imagesPath}`);
    resultStrArr.push(`[I] 输出文件路径: ${transferPath}`);
    resultStrArr.push(`[I] 输出文件路径: ${logsPath}`);
    resultStrArr.push(`[I] 标注文件数量: ${labelFiles.length}`);
    resultStrArr.push(`[I] 图片文件数量: ${imagesFiles.length}`);
    resultStrArr.push(`[I] 图片后缀: ${imageExts}`);

    // Carry classes.txt over to the transfer directory when the label set has one.
    const classesFile = path.join(labelsPath, 'classes.txt');
    if (fs.existsSync(classesFile)) {
      await cpFile(classesFile, path.join(transferPath, 'classes.txt'));
    }

    // The detailed log starts with the same header as the summary.
    logStrArr.push(...resultStrArr);

    for (const [i, labelFile] of labelFiles.entries()) {
      const filePath = path.join(labelsPath, labelFile);
      // The image shares the label file's base name; strip only a trailing ".txt".
      const labelName = path.basename(labelFile, '.txt');
      const fileData = await readFile(filePath);
      const result = parseYolo(fileData);

      // Try each known extension and stop at the first image that exists.
      let imagePath = '';
      let imageName = '';
      for (const ext of imageExts) {
        const candidateName = `${labelName}.${ext}`;
        const candidatePath = path.join(imagesPath, candidateName);
        if (fs.existsSync(candidatePath)) {
          imagePath = candidatePath;
          imageName = candidateName;
          break;
        }
      }
      if (!imagePath) {
        logStrArr.push(`[E] 图片不存在 ${filePath} ${fileData} 可能是图片后缀异常 [ ${imageExts.join(', ')} ]`);
        continue;
      }

      // Redraw the progress line in place.
      readline.cursorTo(process.stdout, 0);
      readline.clearScreenDown(process.stdout);
      process.stdout.write(`${i} / ${labelFiles.length} | 解析文件: ${filePath} 中\n`);

      // Label file without any annotation: count it and set the image aside.
      if (result.length === 0) {
        countMap['空'] = (countMap['空'] || 0) + 1;
        const emptyFilePath = path.join(emptyPath, imageName);
        logStrArr.push(`[E] ${labelName}内容为空 空文件 ${emptyFilePath}`);
        console.log(`空文件: ${emptyFilePath}`)
        await cpFile(imagePath, emptyFilePath);
        continue;
      }

      // Count the annotations of this file per type name.
      const fileType = {};
      for (const annotation of result) {
        const typeName = dimensionType[annotation.type];
        if (!typeName) {
          console.log('未知类型')
          logStrArr.push(`[E] 未知类型 ${filePath} ${fileData}`);
          continue;
        }
        fileType[typeName] = (fileType[typeName] || 0) + 1;
      }
      console.log(fileType)
      if (Object.keys(fileType).length === 0) {
        countMap['异常标注'] = (countMap['异常标注'] || 0) + 1;
        continue;
      }

      // The sorted, "-"-joined type names identify the combination, e.g. "typeA-typeB".
      const fileTypeKeys = Object.keys(fileType).sort().join('-');
      countMap[fileTypeKeys] = (countMap[fileTypeKeys] || 0) + 1;

      const transferFilePath = path.join(transferPath, fileTypeKeys);
      const transferImagePath = path.join(transferFilePath, `images/${imageName}`);
      const transferLabelPath = path.join(transferFilePath, `labels/${labelFile}`);
      // Wait for both copies so a failure surfaces here rather than as an unhandled
      // rejection, and so the log is only written after the files are in place.
      await cpFile(filePath, transferLabelPath);
      await cpFile(imagePath, transferImagePath);
      logStrArr.push(`${fileTypeKeys}: ${countMap[fileTypeKeys]} ${filePath} ${imagePath} ===> ${transferLabelPath} ${transferImagePath} `);
      console.log(logStrArr[logStrArr.length - 1]);
    }

    console.log(countMap);
    // Append the counters to both reports and persist them.
    logStrArr.push(`\r\n${JSON.stringify(countMap, null, 4)}`);
    resultStrArr.push(`${JSON.stringify(countMap, null, 4)}`);
    const logStr = logStrArr.join('\r\n');
    const resultStr = resultStrArr.join('\r\n');
    await writeFile(path.join(logsPath, 'result.txt'), resultStr);
    await writeFile(path.join(logsPath, 'log.txt'), logStr);
    console.log('end');
  }
  /**
   * Copy the label files of one classified type combination back into the raw labels
   * directory.
   *
   * The combination is selected by the class ids listed below; their names from
   * `dimensionType`, joined with `-`, give the folder under the transfer directory whose
   * `labels` sub-folder is copied. All paths are hard-coded.
   *
   * The copies are started together and awaited, so a failed copy rejects the returned
   * promise instead of being lost.
   *
   * @returns {Promise<void>} Settles once every label file has been copied.
   */
  async function reCopyToRaw() {
    const labelsPath = `E:\\图库\\ai\\epower\\newLabels`;
    // Directory holding the classified data.
    const transferPath = `E:\\图库\\ai\\epower\\parse`;
    // Class ids of the combination to copy (see dimensionType for the id → name table).
    const selectedTypeIds = [1, 3];
    const targetType = selectedTypeIds
      .map((id) => dimensionType[id] || '未知类型')
      .join('-');
    const targetPath = path.join(transferPath, targetType);
    const targetLabelPath = path.join(targetPath, 'labels');

    const labelFiles = fs.readdirSync(targetLabelPath);
    await Promise.all(
      labelFiles.map((labelFile) =>
        cpFile(path.join(targetLabelPath, labelFile), path.join(labelsPath, labelFile)),
      ),
    );
    console.log(targetPath);
  }
  /**
   * Copy the classified label files into a separate directory tree.
   *
   * Every entry of the source root except `classes.txt` is treated as a class folder; the
   * files of its `labels` sub-folder are copied, one after another, to
   * `<destination>/<class folder>/<file>`. Both roots are hard-coded.
   *
   * @returns {Promise<void>} Settles once the last file has been copied.
   */
  async function copyLabel() {
    // Previous data set: E:\图库\ai\epower_v2\parse → E:\图库\ai\epower_v2\tmpLabels_2
    const sourceRoot = `E:\\图库\\ai\\epower\\parse`
    const destinationRoot = `E:\\图库\\ai\\epower\\tmpLabels_2`
    for (const classFolder of fs.readdirSync(sourceRoot)) {
      if (classFolder === 'classes.txt') {
        console.log(`skip classes.txt`)
        continue;
      }
      const labelsDir = path.join(sourceRoot, classFolder, 'labels');
      console.log(labelsDir)
      for (const labelFile of fs.readdirSync(labelsDir)) {
        const from = path.join(labelsDir, labelFile);
        const to = path.join(destinationRoot, classFolder, labelFile);
        await cpFile(from, to)
      }
    }
  }
  /**
   * Merge classified label files into one directory and copy the matching images.
   *
   * Every entry of `allClassLabelsPath` except `classes.txt` is treated as a folder of
   * label files. Each label file is copied to `<resultPath>/labels`; the image with the
   * same base name is looked up in `baseImagesPath` and copied to `<resultPath>/images`.
   * A label whose image cannot be found is still copied, and the miss is logged.
   *
   * @param {string} allClassLabelsPath - Directory whose sub-folders hold the label files.
   * @param {string} baseImagesPath - Directory containing all images.
   * @param {string} resultPath - Output directory; `labels` and `images` are created in it.
   * @return {Promise<void>} Settles once every copy has finished; rejects on the first
   *   failed copy.
   */
  async function copyLabelToAll(allClassLabelsPath, baseImagesPath, resultPath) {
    console.log([
      '将尝试通过分类后的label文件进行最终数据获取\n',
      `labels:${allClassLabelsPath} \n`,
      `images:${baseImagesPath} \n`,
      `复制至 ${resultPath} 下`,
    ].join('\n'))
    const resultLabelsPath = path.join(resultPath, 'labels');
    const resultImagesPath = path.join(resultPath, 'images');
    // Extensions tried, in order, when looking for the image of a label.
    const imageExts = ['jpg'];

    for (const subDirName of fs.readdirSync(allClassLabelsPath)) {
      if (subDirName === 'classes.txt') {
        console.log(`skip classes.txt`)
        continue;
      }
      const subDir = path.join(allClassLabelsPath, subDirName);
      console.log(subDir)

      for (const labelFile of fs.readdirSync(subDir)) {
        if (labelFile === 'classes.txt') {
          console.log(`skip classes.txt`)
          continue;
        }
        // Strip only a trailing ".txt" to get the shared base name.
        const labelName = path.basename(labelFile, '.txt');
        const labelFilePath = path.join(subDir, labelFile);
        await cpFile(labelFilePath, path.join(resultLabelsPath, labelFile));

        // A joined path is never empty, so the image has to be probed on disk.
        const imageName = imageExts
          .map((ext) => `${labelName}.${ext}`)
          .find((candidate) => fs.existsSync(path.join(baseImagesPath, candidate)));
        if (!imageName) {
          console.log(`[E] 图片不存在 ${labelFilePath} 可能是图片后缀异常 [ ${imageExts.join(', ')} ]`);
          continue;
        }
        const imagePath = path.join(baseImagesPath, imageName);
        const newImagePath = path.join(resultImagesPath, imageName);
        console.log(`移动图片 ${imagePath} => ${newImagePath}`)
        await cpFile(imagePath, newImagePath);
      }
    }
  }
  /**
   * Collect the label files of a flat directory together with their images.
   *
   * For every regular file in `labelsPath` the `.jpg` image with the same base name is
   * looked up in `baseImagePath`. When it exists, the label is copied to
   * `<resultPath>/labels` and the image to `<resultPath>/images`; otherwise the label is
   * skipped and an error line is printed. Sub-directories of `labelsPath` are skipped.
   *
   * @param {string} labelsPath - Directory containing the label files.
   * @param {string} baseImagePath - Directory containing all images.
   * @param {string} resultPath - Output directory; `labels` and `images` are created in it.
   * @returns {Promise<void>} Settles once every copy has finished; rejects on the first
   *   failed copy.
   */
  async function getImageByLabel(labelsPath, baseImagePath, resultPath) {
    console.log([
      '将尝试通过label文件进行最终数据获取\n',
      `labels:${labelsPath} \n`,
      `images:${baseImagePath} \n`,
      `复制至 ${resultPath} 下`,
    ].join('\n'))
    const labelFiles = fs.readdirSync(labelsPath);
    const resultLabelsPath = path.join(resultPath, 'labels');
    const resultImagesPath = path.join(resultPath, 'images');

    for (const labelFile of labelFiles) {
      const labelFilePath = path.join(labelsPath, labelFile);
      if (!fs.statSync(labelFilePath).isFile()) {
        console.log(`[E] ${labelFile} 不是文件, 该函数暂不支持`)
        continue;
      }
      // Strip only a trailing ".txt" to get the base name shared with the image.
      const labelName = path.basename(labelFile, '.txt');
      const imageName = `${labelName}.jpg`;
      const imagePath = path.join(baseImagePath, imageName);
      // statSync throws for a path that does not exist, so check existence first;
      // otherwise a single missing image would abort the whole run.
      if (!fs.existsSync(imagePath) || !fs.statSync(imagePath).isFile()) {
        console.error(`[E] 无法找到图片文件${imagePath}`)
        continue;
      }
      await cpFile(labelFilePath, path.join(resultLabelsPath, labelFile));
      await cpFile(imagePath, path.join(resultImagesPath, imageName));
    }
  }
  /**
   * Normalise every line break in a string to `\r\n`.
   *
   * @param {string} input - Text whose line breaks may be `\r\n`, `\r` or `\n`.
   * @return {string} The same text with each line break written as `\r\n`.
   */
  function replaceNewlines(input) {
    // "\r\n" is listed first so that a Windows line break is treated as one break.
    const pieces = input.split(/\r\n|\r|\n/);
    return pieces.join('\r\n');
  }
  /**
   * Rewrite every label file of the hard-coded labels directory so that its line breaks
   * are `\r\n`. Entries that are not regular files are left alone. Files are processed
   * one at a time, synchronously, and overwritten in place.
   */
  function relineLabel() {
    const labelsDir = `E:\\图库\\ai\\epower\\all_tmpLabels_2`
    for (const entry of fs.readdirSync(labelsDir)) {
      const target = path.join(labelsDir, entry);
      if (fs.statSync(target).isFile()) {
        console.log(`start reline ${target}`);
        const original = fs.readFileSync(target, 'utf-8');
        fs.writeFileSync(target, replaceNewlines(original))
      }
    }
  }
  /**
   * Convert a directory of XML annotation files into YOLO label files.
   *
   * `classes.txt` in `basePath` lists one class name per line; the line index becomes the
   * class id. The file is copied to `resultPath`, then every `*.xml` file directly inside
   * `basePath` is converted with `yoloXml2yolo` and written to `resultPath` under the same
   * base name with a `.txt` extension. Sub-directories are skipped; other files are
   * reported and skipped.
   *
   * @param {string} basePath - Directory containing `classes.txt` and the XML files.
   * @param {string} resultPath - Directory that receives the converted files.
   * @returns {Promise<void>} Settles once every file has been converted.
   */
  async function xmlToYolo(basePath, resultPath) {
    const classesPath = path.join(basePath, 'classes.txt');
    const classes = fs.readFileSync(classesPath, 'utf-8').split('\n');
    const classesMap = {};
    classes.forEach((line, index) => {
      // All whitespace (including a trailing "\r") is removed from the name.
      const className = line.replace(/\s/g, '');
      // A blank line is not a class; ids of the remaining lines stay their line index.
      if (className === '') return;
      classesMap[className] = index;
    });

    // Wait for the copy so that a failure is reported instead of being lost.
    await cpFile(classesPath, path.join(resultPath, 'classes.txt'));
    console.log(classesMap)

    for (const fileName of fs.readdirSync(basePath)) {
      const labelFilePath = path.join(basePath, fileName);
      if (fs.statSync(labelFilePath).isDirectory()) {
        continue;
      }
      if (!fileName.endsWith('.xml')) {
        console.log(`[E] ${fileName} 不是xml, 该函数暂不支持解析`)
        continue;
      }
      // Swap only the trailing ".xml" for ".txt".
      const yoloFileName = `${path.basename(fileName, '.xml')}.txt`;
      const resultFilePath = path.join(resultPath, yoloFileName);
      await yoloXml2yolo(labelFilePath, classesMap, resultFilePath);
    }
  }
  /**
   * Convert a corner-based bounding box into normalised centre/size form.
   *
   * The box object passed in is left untouched; its fields are parsed into local numbers.
   *
   * @param {{width: (number|string), height: (number|string)}} size - Image dimensions.
   * @param {{xmin: (number|string), ymin: (number|string), xmax: (number|string), ymax: (number|string)}} box
   *   Corner coordinates of the box, in the same unit as `size`.
   * @returns {number[]} `[centerX, centerY, width, height]`, with the x values divided by
   *   `size.width` and the y values divided by `size.height`.
   */
  function convert(size, box) {
    const dw = 1 / size.width;
    const dh = 1 / size.height;
    // Coordinates may arrive as strings (e.g. read from XML), so parse them first.
    const xmin = parseFloat(box.xmin);
    const ymin = parseFloat(box.ymin);
    const xmax = parseFloat(box.xmax);
    const ymax = parseFloat(box.ymax);
    const centerX = (xmin + xmax) / 2;
    const centerY = (ymin + ymax) / 2;
    const boxWidth = xmax - xmin;
    const boxHeight = ymax - ymin;
    return [centerX * dw, centerY * dh, boxWidth * dw, boxHeight * dh];
  }
  /**
   * Parse an XML string into a plain object with xml2js.
   *
   * Parser options: single children are not wrapped in arrays, attributes are merged into
   * the element object, and the root element itself is omitted from the result.
   *
   * @param {string} xml - XML source text.
   * @returns {Promise<Object>} Resolves with the parsed document; rejects with the parser
   *   error when the XML cannot be parsed.
   */
  function _xmlToJson(xml) {
    return new Promise((resolve, reject) => {
      const parser = new xml2js.Parser({
        explicitArray: false,
        mergeAttrs: true,
        explicitRoot: false
      });
      parser.parseString(xml, (err, result) => {
        // Surface parse failures instead of resolving with undefined.
        if (err) return reject(err);
        resolve(result);
      });
    });
  }
  /**
   * Convert one XML annotation file into a YOLO label file.
   *
   * Each `<object>` of the annotation becomes one output line
   * `<classId> <centerX> <centerY> <width> <height>` terminated by `\r\n`, where the class
   * id is looked up in `classMap` by the object's name and the numbers come from `convert`.
   * An annotation without objects produces an empty file. Objects with an unknown class
   * name or without a bounding box are reported and left out, rather than being written
   * with the class id `undefined`.
   *
   * @param {string} xmlPath - XML annotation file to read.
   * @param {Object<string, number>} classMap - Class name → class id.
   * @param {string} resultPath - YOLO label file to write.
   * @returns {Promise<void>} Settles once the label file has been written.
   * @throws {Error} When the annotation has no `<size>` element.
   */
  async function yoloXml2yolo(xmlPath, classMap, resultPath) {
    const xml = fs.readFileSync(xmlPath, 'utf-8');
    const obj = await _xmlToJson(xml);
    if (!obj || !obj.size) {
      throw new Error(`${xmlPath}: annotation has no <size> element`);
    }

    // The parser yields nothing for no <object>, a bare object for exactly one and an
    // array for several; normalise all three shapes to an array.
    let objects;
    if (obj.object === undefined || obj.object === null) {
      objects = [];
    } else if (Array.isArray(obj.object)) {
      objects = obj.object;
    } else {
      console.log(`${obj.filename} object is object`)
      console.log(obj)
      objects = [obj.object];
    }

    const lines = [];
    for (const sub of objects) {
      if (!sub || !sub.bndbox) {
        console.error(`[E] ${xmlPath}: object without bndbox skipped`);
        continue;
      }
      const classId = classMap[sub.name];
      if (classId === undefined) {
        console.error(`[E] ${xmlPath}: unknown class "${sub.name}" skipped`);
        continue;
      }
      const arr = convert(obj.size, sub.bndbox);
      lines.push(`${classId} ${arr.join(" ")}\r\n`);
    }

    await writeFile(resultPath, lines.join(''));
  }
  // Only the xmlToYolo step is currently enabled; the other steps below are commented out.
  xmlToYolo("E:\\图库\\验证数据集\\labels - 副本", "E:\\图库\\验证数据集\\result")
    .catch((err) => {
      // Report the failure and mark the process as failed instead of leaving the
      // rejection unhandled.
      console.error(err);
      process.exitCode = 1;
    });
  // main();
  /**
   * Copy the classified label files to a new directory: parse => newLabels.
   * Only the files of the selected type combination are copied.
   */
  // reCopyToRaw();
  /**
   * Copy the classified label files to a new directory: parse => tmpLabels_2.
   * Only the label files are copied.
   */
  // copyLabel();
  /**
   * Merge the classified label files into a single folder (the classified labels are
   * expected to be sub-directories of the labels directory) and copy the matching images.
   * Classified label directory: tmpLabels_2
   * Base image directory: images
   * Output label directory: all_tmpLabels_2
   * Output image directory: all_tmpImages_2
   */
  // Classified label directory, e.g. E:\图库\ai\epower\tmpLabels_2
  let allClassLabelsPath = `E:\\图库\\ai\\epower\\验证数据集_labels`
  // Base image directory
  let baseImagesPath = `E:\\图库\\ai\\epower\\images`
  // Output directory
  let resultPath = `E:\\图库\\ai\\epower\\验证数据集`
  // copyLabelToAll(allClassLabelsPath, baseImagesPath, resultPath)
  // getImageByLabel(`E:\\图库\\ai\\epower\\验证数据集_labels`, `E:\\图库\\ai\\epower\\images`, resultPath)
  // relineLabel();