| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136 |
- # coding=utf-8
- '''
- Reference: https://huggingface.co/datasets/nielsr/funsd/blob/main/funsd.py
- '''
- import json
- import os
- import datasets
- from .image_utils import load_image, normalize_bbox
# Module-level logger, obtained through the logging helper of the `datasets` package.
logger = datasets.logging.get_logger(__name__)

# BibTeX entry for the FUNSD paper; passed as `citation` to `datasets.DatasetInfo`
# in `Funsd._info`.
_CITATION = """\
@article{Jaume2019FUNSDAD,
title={FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents},
author={Guillaume Jaume and H. K. Ekenel and J. Thiran},
journal={2019 International Conference on Document Analysis and Recognition Workshops (ICDARW)},
year={2019},
volume={2},
pages={1-6}
}
"""

# Dataset description (currently just the project homepage URL); passed as
# `description` to `datasets.DatasetInfo` in `Funsd._info`.
_DESCRIPTION = """\
https://guillaumejaume.github.io/FUNSD/
"""
class FunsdConfig(datasets.BuilderConfig):
    """Builder configuration used by the FUNSD dataset loader."""

    def __init__(self, **kwargs):
        """Create a FUNSD builder configuration.

        Args:
            **kwargs: keyword arguments handed unchanged to the parent
                ``datasets.BuilderConfig`` initializer.
        """
        super().__init__(**kwargs)
class Funsd(datasets.GeneratorBasedBuilder):
    """FUNSD dataset builder.

    Each generated example corresponds to one annotation file and carries the
    words of the form, one bounding box per word, a BIO tag per word, the
    loaded page image and the path the image was read from.
    """

    BUILDER_CONFIGS = [
        FunsdConfig(name="funsd", version=datasets.Version("1.0.0"), description="FUNSD dataset"),
    ]

    def _info(self):
        """Return the dataset metadata and the feature schema of an example."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "tokens": datasets.Sequence(datasets.Value("string")),
                    "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
                    "ner_tags": datasets.Sequence(
                        datasets.features.ClassLabel(
                            names=["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"]
                        )
                    ),
                    "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"),
                    "image_path": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage="https://guillaumejaume.github.io/FUNSD/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the FUNSD archive and return the train/test splits.

        Args:
            dl_manager: download manager supplied by ``datasets``; only its
                ``download_and_extract`` method is used.

        Returns:
            A list with the TRAIN and TEST ``datasets.SplitGenerator`` objects,
            each pointing ``filepath`` at the matching directory of the archive.
        """
        downloaded_file = dl_manager.download_and_extract("https://guillaumejaume.github.io/FUNSD/dataset.zip")
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"filepath": f"{downloaded_file}/dataset/training_data/"}
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST, gen_kwargs={"filepath": f"{downloaded_file}/dataset/testing_data/"}
            ),
        ]

    def get_line_bbox(self, bboxs):
        """Replace every box of a segment by the segment's enclosing box.

        Even positions of each box are treated as x coordinates and odd
        positions as y coordinates.

        Args:
            bboxs: list of boxes (sequences of numbers) belonging to one segment.

        Returns:
            A list with ``len(bboxs)`` independent copies of
            ``[min_x, min_y, max_x, max_y]``; an empty list when ``bboxs`` is
            empty.
        """
        if not bboxs:
            return []
        xs = [coord for box in bboxs for coord in box[0::2]]
        ys = [coord for box in bboxs for coord in box[1::2]]
        # min/max over the same values guarantee x1 >= x0 and y1 >= y0.
        line_box = [min(xs), min(ys), max(xs), max(ys)]
        return [list(line_box) for _ in bboxs]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs for every annotation file of a split.

        Args:
            filepath: split directory containing the ``annotations`` and
                ``images`` sub-directories.

        Yields:
            Tuples ``(guid, example)`` where ``guid`` is the index of the
            annotation file in sorted order and ``example`` is a dict with the
            keys ``id``, ``tokens``, ``bboxes``, ``ner_tags``, ``image`` and
            ``image_path``.
        """
        logger.info("⏳ Generating examples from = %s", filepath)
        ann_dir = os.path.join(filepath, "annotations")
        img_dir = os.path.join(filepath, "images")
        # Only annotation files are considered, so stray files in the directory
        # (e.g. OS metadata) do not break JSON parsing.
        ann_names = sorted(name for name in os.listdir(ann_dir) if name.endswith(".json"))
        for guid, ann_name in enumerate(ann_names):
            tokens = []
            bboxes = []
            ner_tags = []
            with open(os.path.join(ann_dir, ann_name), "r", encoding="utf8") as f:
                data = json.load(f)
            # Swap only the file extension; a plain str.replace("json", "png")
            # would also rewrite "json" appearing in directory or file names.
            image_path = os.path.join(img_dir, os.path.splitext(ann_name)[0] + ".png")
            # NOTE(review): load_image is a project helper; it is assumed to
            # return the image matching the "image" feature plus the page size
            # expected by normalize_bbox -- confirm in image_utils.
            image, size = load_image(image_path)
            for item in data["form"]:
                label = item["label"]
                # Words whose text is empty or whitespace-only are dropped.
                words = [w for w in item["words"] if w["text"].strip() != ""]
                if not words:
                    continue
                if label == "other":
                    tags = ["O"] * len(words)
                else:
                    # First word of the segment opens the entity, the rest continue it.
                    upper_label = label.upper()
                    tags = ["B-" + upper_label] + ["I-" + upper_label] * (len(words) - 1)
                tokens.extend(w["text"] for w in words)
                ner_tags.extend(tags)
                word_bboxes = [normalize_bbox(w["box"], size) for w in words]
                # Segment-level layout (the default, "--segment_level_layout 1"):
                # every word gets the box enclosing its whole segment. To use
                # word-level boxes instead, extend ``bboxes`` with
                # ``word_bboxes`` directly.
                bboxes.extend(self.get_line_bbox(word_bboxes))
            yield guid, {"id": str(guid), "tokens": tokens, "bboxes": bboxes, "ner_tags": ner_tags,
                         "image": image, "image_path": image_path}
|