test_tablemaster.py 3.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. import unittest
  2. from PIL import Image
  3. from lxml import etree
  4. from magic_pdf.model.ppTableModel import ppTableModel
  5. class TestppTableModel(unittest.TestCase):
  6. def test_image2html(self):
  7. img = Image.open("tests/test_table/assets/table.jpg")
  8. # 修改table模型路径
  9. config = {"device": "cuda",
  10. "model_dir": "D:/models/PDF-Extract-Kit/models/TabRec/TableMaster"}
  11. table_model = ppTableModel(config)
  12. res = table_model.img2html(img)
  13. # 验证生成的 HTML 是否符合预期
  14. parser = etree.HTMLParser()
  15. tree = etree.fromstring(res, parser)
  16. # 检查 HTML 结构
  17. assert tree.find('.//table') is not None, "HTML should contain a <table> element"
  18. assert tree.find('.//thead') is not None, "HTML should contain a <thead> element"
  19. assert tree.find('.//tbody') is not None, "HTML should contain a <tbody> element"
  20. assert tree.find('.//tr') is not None, "HTML should contain a <tr> element"
  21. assert tree.find('.//td') is not None, "HTML should contain a <td> element"
  22. # 检查具体的表格内容
  23. headers = tree.xpath('//thead/tr/td/b')
  24. print(headers) # Print headers for debugging
  25. assert len(headers) == 5, "Thead should have 5 columns"
  26. assert headers[0].text and headers[0].text.strip() == "Methods", "First header should be 'Methods'"
  27. assert headers[1].text and headers[1].text.strip() == "R", "Second header should be 'R'"
  28. assert headers[2].text and headers[2].text.strip() == "P", "Third header should be 'P'"
  29. assert headers[3].text and headers[3].text.strip() == "F", "Fourth header should be 'F'"
  30. assert headers[4].text and headers[4].text.strip() == "FPS", "Fifth header should be 'FPS'"
  31. # 检查第一行数据
  32. first_row = tree.xpath('//tbody/tr[1]/td')
  33. assert len(first_row) == 5, "First row should have 5 cells"
  34. assert first_row[0].text and first_row[0].text.strip() == "SegLink[26]", "First cell should be 'SegLink[26]'"
  35. assert first_row[1].text and first_row[1].text.strip() == "70.0", "Second cell should be '70.0'"
  36. assert first_row[2].text and first_row[2].text.strip() == "86.0", "Third cell should be '86.0'"
  37. assert first_row[3].text and first_row[3].text.strip() == "77.0", "Fourth cell should be '77.0'"
  38. assert first_row[4].text and first_row[4].text.strip() == "8.9", "Fifth cell should be '8.9'"
  39. # 检查倒数第二行数据
  40. second_last_row = tree.xpath('//tbody/tr[position()=last()-1]/td')
  41. assert len(second_last_row) == 5, "second_last_row should have 5 cells"
  42. assert second_last_row[0].text and second_last_row[0].text.strip() == "Ours (SynText)", "First cell should be 'Ours (SynText)'"
  43. assert second_last_row[1].text and second_last_row[1].text.strip() == "80.68", "Second cell should be '80.68'"
  44. assert second_last_row[2].text and second_last_row[2].text.strip() == "85.40", "Third cell should be '85.40'"
  45. assert second_last_row[3].text and second_last_row[3].text.strip() == "82.97", "Fourth cell should be '82.97'"
  46. assert second_last_row[3].text and second_last_row[4].text.strip() == "12.68", "Fifth cell should be '12.68'"
  47. if __name__ == "__main__":
  48. unittest.main()