Spaces:
Running
Running
File size: 1,478 Bytes
93bc8ec 5fd26bb 55de44e 93bc8ec 5fd26bb 55de44e 93bc8ec 5fd26bb 55de44e 5fd26bb 55de44e 5fd26bb 55de44e 93bc8ec 5fd26bb 55de44e 5fd26bb 55de44e 93bc8ec 55de44e 93bc8ec 55de44e 93bc8ec 55de44e 93bc8ec 55de44e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import os
from bs4 import BeautifulSoup
from document_qa.grobid_processors import get_xml_nodes_body, get_xml_nodes_figures, get_xml_nodes_header
from tests.resources import TEST_DATA_PATH
def test_get_xml_nodes_body_paragraphs():
    """Check the body node count for the ``paragraphs`` TEI fixture.

    Parses ``2312.07559.paragraphs.tei.xml`` and expects
    ``get_xml_nodes_body(soup, use_paragraphs=True)`` to return 70 nodes.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.paragraphs.tei.xml")
    # Hand the parser raw bytes: text mode without an explicit encoding decodes
    # with the platform locale, which is not guaranteed to match the document.
    with open(fixture_path, 'rb') as fo:
        soup = BeautifulSoup(fo, 'xml')

    nodes = get_xml_nodes_body(soup, use_paragraphs=True)

    assert len(nodes) == 70
def test_get_xml_nodes_body_sentences():
    """Check the body node count for the ``sentences`` TEI fixture.

    Parses ``2312.07559.sentences.tei.xml`` and expects
    ``get_xml_nodes_body(soup, use_paragraphs=False)`` to return 327 nodes.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.sentences.tei.xml")
    # Hand the parser raw bytes: text mode without an explicit encoding decodes
    # with the platform locale, which is not guaranteed to match the document.
    with open(fixture_path, 'rb') as fo:
        soup = BeautifulSoup(fo, 'xml')

    children = get_xml_nodes_body(soup, use_paragraphs=False)

    assert len(children) == 327
def test_get_xml_nodes_figures():
    """Check the figure node count for the ``paragraphs`` TEI fixture.

    Parses ``2312.07559.paragraphs.tei.xml`` and expects
    ``get_xml_nodes_figures(soup)`` to return 13 nodes.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.paragraphs.tei.xml")
    # Hand the parser raw bytes: text mode without an explicit encoding decodes
    # with the platform locale, which is not guaranteed to match the document.
    with open(fixture_path, 'rb') as fo:
        soup = BeautifulSoup(fo, 'xml')

    children = get_xml_nodes_figures(soup)

    assert len(children) == 13
def test_get_xml_nodes_header_paragraphs():
    """Check the header node count for the ``paragraphs`` TEI fixture.

    Parses ``2312.07559.paragraphs.tei.xml`` and expects
    ``get_xml_nodes_header(soup)`` (called with its default arguments) to
    return 8 nodes.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.paragraphs.tei.xml")
    # Hand the parser raw bytes: text mode without an explicit encoding decodes
    # with the platform locale, which is not guaranteed to match the document.
    with open(fixture_path, 'rb') as fo:
        soup = BeautifulSoup(fo, 'xml')

    children = get_xml_nodes_header(soup)

    assert len(children) == 8
def test_get_xml_nodes_header_sentences():
    """Check the header node count for the ``sentences`` TEI fixture.

    Parses ``2312.07559.sentences.tei.xml`` and expects
    ``get_xml_nodes_header(soup, use_paragraphs=False)`` to return 15 nodes.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.sentences.tei.xml")
    # Hand the parser raw bytes: text mode without an explicit encoding decodes
    # with the platform locale, which is not guaranteed to match the document.
    with open(fixture_path, 'rb') as fo:
        soup = BeautifulSoup(fo, 'xml')

    children = get_xml_nodes_header(soup, use_paragraphs=False)

    assert len(children) == 15
|