tokenizer-arena / stats /compression_rate /ClassCat.gpt2-base-french @ cc100.en.diff.json
xu-song's picture
add compression_rate details
a4208a2
[
{
"text": "Belmont Estate is on the market for $63 million and boasts roughly 22,000 square feet of luxurious finishes and elaborate architecture on 1.28 acres. Listed on Thursday, the home is being sold by high-end real estate firm Sotheby’s International Realty Canada.",
"decoded_text": "belmont estate is on the market for $63 million and boasts roughly 22,000 square feet of luxurious finishes and elaborate architecture on 1.28 acres. listed on thursday, the home is being sold by high-end real estate firm sotheby’s international realty canada.",
"diff": [
"replace text[0:9] --> decoded_text[0:9] 'Belmont E' --> 'belmont e'",
"replace text[150:161] --> decoded_text[150:161] 'Listed on T' --> 'listed on t'",
"replace text[222:223] --> decoded_text[222:223] 'S' --> 's'",
"replace text[232:254] --> decoded_text[232:254] 'International Realty C' --> 'international realty c'"
],
"n_oov_chars": 8,
"oov_ratio": 0.03076923076923077,
"oov_charset": "[\"B\", \"E\", \"L\", \"T\", \"S\", \"I\", \"R\", \"C\"]"
},
{
"text": "“Within the city we’ve had homes that have sold for $56 million, $33 million, $31 million but this will be the record of the offering price,” listing agent Christa Frosch of Sotheby’s tells BuzzBuzzNews.",
"decoded_text": "“within the city we’ve had homes that have sold for $56 million, $33 million, $31 million but this will be the record of the offering price,” listing agent christa frosch of sotheby’s tells buzzbuzznews.",
"diff": [
"replace text[1:2] --> decoded_text[1:2] 'W' --> 'w'",
"replace text[156:175] --> decoded_text[156:175] 'Christa Frosch of S' --> 'christa frosch of s'",
"replace text[190:191] --> decoded_text[190:191] 'B' --> 'b'",
"replace text[194:195] --> decoded_text[194:195] 'B' --> 'b'",
"replace text[198:199] --> decoded_text[198:199] 'N' --> 'n'"
],
"n_oov_chars": 7,
"oov_ratio": 0.034482758620689655,
"oov_charset": "[\"W\", \"C\", \"F\", \"S\", \"B\", \"N\"]"
},
{
"text": "The three-storey home has five bedrooms, twelve bathrooms and an elevator in the west wing. Built to entertain, two main gallery halls can seat up to 100 guests. The Italian-inspired kitchen includes a fireplace and walls and ceilings throughout the home feature murals and artwork. Lavish amenities include an indoor pool and sauna, a six-car garage and a private entrance in-law’s suite.",
"decoded_text": "the three-storey home has five bedrooms, twelve bathrooms and an elevator in the west wing. built to entertain, two main gallery halls can seat up to 100 guests. the italian-inspired kitchen includes a fireplace and walls and ceilings throughout the home feature murals and artwork. lavish amenities include an indoor pool and sauna, a six-car garage and a private entrance in-law’s suite.",
"diff": [
"replace text[0:1] --> decoded_text[0:1] 'T' --> 't'",
"replace text[92:93] --> decoded_text[92:93] 'B' --> 'b'",
"replace text[162:167] --> decoded_text[162:167] 'The I' --> 'the i'",
"replace text[283:284] --> decoded_text[283:284] 'L' --> 'l'"
],
"n_oov_chars": 5,
"oov_ratio": 0.012853470437017995,
"oov_charset": "[\"T\", \"B\", \"I\", \"L\"]"
},
{
"text": "Surrounding the property is a Versailles-inspired garden with a variety of trees, plants and an orchard. In the spring, over 12,000 flowers bloom in the tiered, three-level garden.",
"decoded_text": "surrounding the property is a versailles-inspired garden with a variety of trees, plants and an orchard. in the spring, over 12,000 flowers bloom in the tiered, three-level garden.",
"diff": [
"replace text[0:1] --> decoded_text[0:1] 'S' --> 's'",
"replace text[30:31] --> decoded_text[30:31] 'V' --> 'v'",
"replace text[105:106] --> decoded_text[105:106] 'I' --> 'i'"
],
"n_oov_chars": 3,
"oov_ratio": 0.016666666666666666,
"oov_charset": "[\"S\", \"V\", \"I\"]"
},
{
"text": "According to Frosch, the listing has received global attention and, despite being on the market for only 24 hours, buyers are already showing interest.",
"decoded_text": "according to frosch, the listing has received global attention and, despite being on the market for only 24 hours, buyers are already showing interest.",
"diff": [
"replace text[0:1] --> decoded_text[0:1] 'A' --> 'a'",
"replace text[13:14] --> decoded_text[13:14] 'F' --> 'f'"
],
"n_oov_chars": 2,
"oov_ratio": 0.013245033112582781,
"oov_charset": "[\"A\", \"F\"]"
},
{
"text": "“We just went to the market yesterday, it’s private through Sotheby’s and we’ve already started to get calls,” says Frosch.",
"decoded_text": "“we just went to the market yesterday, it’s private through sotheby’s and we’ve already started to get calls,” says frosch.",
"diff": [
"replace text[1:2] --> decoded_text[1:2] 'W' --> 'w'",
"replace text[60:61] --> decoded_text[60:61] 'S' --> 's'",
"replace text[116:117] --> decoded_text[116:117] 'F' --> 'f'"
],
"n_oov_chars": 3,
"oov_ratio": 0.024390243902439025,
"oov_charset": "[\"W\", \"S\", \"F\"]"
},
{
"text": "Stay well hydrated—that means you should include about 48- 64 ounces of liquid (non-calorie) each day. You will be drinking small amounts (“sips”) every hour through the day since you will not be able to drink a large amount all at once.",
"decoded_text": "stay well hydrated—that means you should include about 48- 64 ounces of liquid (non-calorie) each day. you will be drinking small amounts (“sips”) every hour through the day since you will not be able to drink a large amount all at once.",
"diff": [
"replace text[0:1] --> decoded_text[0:1] 'S' --> 's'",
"replace text[103:104] --> decoded_text[103:104] 'Y' --> 'y'"
],
"n_oov_chars": 2,
"oov_ratio": 0.008438818565400843,
"oov_charset": "[\"S\", \"Y\"]"
},
{
"text": "On Day Four after your surgery, begin adding liquid protein during this Phase (20 to 30 grams per day). That means you should buy a protein powder suggested by your dietitian, doctor or nurse. We recommend:",
"decoded_text": "on day four after your surgery, begin adding liquid protein during this phase (20 to 30 grams per day). that means you should buy a protein powder suggested by your dietitian, doctor or nurse. we recommend:",
"diff": [
"replace text[0:13] --> decoded_text[0:7] 'On Day Four a' --> 'on day '",
"insert text[14:14] --> decoded_text[8:14] '' --> 'our af'",
"replace text[72:73] --> decoded_text[72:73] 'P' --> 'p'",
"replace text[104:105] --> decoded_text[104:105] 'T' --> 't'",
"replace text[193:194] --> decoded_text[193:194] 'W' --> 'w'"
],
"n_oov_chars": 6,
"oov_ratio": 0.02912621359223301,
"oov_charset": "[\"O\", \"D\", \"F\", \"P\", \"T\", \"W\"]"
},
{
"text": "Unjury (20grams of whey protein isolate per packet) once a day (available at our office or www.UNJURY.com or 800-517-5111",
"decoded_text": "unjury (20grams of whey protein isolate per packet) once a day (available at our office or www.unjury.com or 800-517-5111",
"diff": [
"replace text[0:1] --> decoded_text[0:1] 'U' --> 'u'",
"replace text[95:101] --> decoded_text[95:101] 'UNJURY' --> 'unjury'"
],
"n_oov_chars": 7,
"oov_ratio": 0.05785123966942149,
"oov_charset": "[\"U\", \"N\", \"J\", \"R\", \"Y\"]"
},
{
"text": "Watch for signs of dehydration!",
"decoded_text": "watch for signs of dehydration!",
"diff": [
"replace text[0:1] --> decoded_text[0:1] 'W' --> 'w'"
],
"n_oov_chars": 1,
"oov_ratio": 0.03225806451612903,
"oov_charset": "[\"W\"]"
}
]