Path: blob/master/4 - Natural Language Processing with Attention Models/Week 2/data/wikihow/all/1.2.0/dataset_info.json
65 views
{
  "citation": "@misc{koupaee2018wikihow,\n title={WikiHow: A Large Scale Text Summarization Dataset},\n author={Mahnaz Koupaee and William Yang Wang},\n year={2018},\n eprint={1810.09305},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}",
  "description": "WikiHow is a new large-scale dataset using the online WikiHow\n(http://www.wikihow.com/) knowledge base.\n\nThere are two features:\n - text: wikihow answers texts.\n - headline: bold lines as summary.\n\nThere are two separate versions:\n - all: consisting of the concatenation of all paragraphs as the articles and\n the bold lines as the reference summaries.\n - sep: consisting of each paragraph and its summary.\n\nDownload \"wikihowAll.csv\" and \"wikihowSep.csv\" from\nhttps://github.com/mahnazkoupaee/WikiHow-Dataset and place them in manual folder\nhttps://www.tensorflow.org/datasets/api_docs/python/tfds/download/DownloadConfig.\nTrain/validation/test splits are provided by the authors.\nPreprocessing is applied to remove short articles\n(abstract length < 0.75 article length) and clean up extra commas.",
  "downloadSize": "5460385",
  "location": {
    "urls": [
      "https://github.com/mahnazkoupaee/WikiHow-Dataset"
    ]
  },
  "name": "wikihow",
  "schema": {
    "feature": [
      {
        "name": "headline",
        "type": "BYTES"
      },
      {
        "name": "text",
        "type": "BYTES"
      },
      {
        "name": "title",
        "type": "BYTES"
      }
    ]
  },
  "sizeInBytes": "5460385",
  "splits": [
    {
      "name": "test",
      "numShards": "1",
      "shardLengths": [
        "5577"
      ],
      "statistics": {
        "features": [
          {
            "bytesStats": {
              "commonStats": {
                "numNonMissing": "5577"
              }
            },
            "name": "headline",
            "type": "BYTES"
          },
          {
            "bytesStats": {
              "commonStats": {
                "numNonMissing": "5577"
              }
            },
            "name": "text",
            "type": "BYTES"
          },
          {
            "bytesStats": {
              "commonStats": {
                "numNonMissing": "5577"
              }
            },
            "name": "title",
            "type": "BYTES"
          }
        ],
        "numExamples": "5577"
      }
    },
    {
      "name": "train",
      "numShards": "1",
      "shardLengths": [
        "39313",
        "39313",
        "39313",
        "39313"
      ],
      "statistics": {
        "features": [
          {
            "bytesStats": {
              "commonStats": {
                "numNonMissing": "157252"
              }
            },
            "name": "headline",
            "type": "BYTES"
          },
          {
            "bytesStats": {
              "commonStats": {
                "numNonMissing": "157252"
              }
            },
            "name": "text",
            "type": "BYTES"
          },
          {
            "bytesStats": {
              "commonStats": {
                "numNonMissing": "157252"
              }
            },
            "name": "title",
            "type": "BYTES"
          }
        ],
        "numExamples": "157252"
      }
    },
    {
      "name": "validation",
      "numShards": "1",
      "shardLengths": [
        "5599"
      ],
      "statistics": {
        "features": [
          {
            "bytesStats": {
              "commonStats": {
                "numNonMissing": "5599"
              }
            },
            "name": "headline",
            "type": "BYTES"
          },
          {
            "bytesStats": {
              "commonStats": {
                "numNonMissing": "5599"
              }
            },
            "name": "text",
            "type": "BYTES"
          },
          {
            "bytesStats": {
              "commonStats": {
                "numNonMissing": "5599"
              }
            },
            "name": "title",
            "type": "BYTES"
          }
        ],
        "numExamples": "5599"
      }
    }
  ],
  "supervisedKeys": {
    "input": "text",
    "output": "headline"
  },
  "version": "1.2.0"
}