Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
amanchadha
GitHub Repository: amanchadha/coursera-natural-language-processing-specialization
Path: blob/master/4 - Natural Language Processing with Attention Models/Week 2/data/wikihow/all/1.2.0/dataset_info.json
65 views
1
{
2
"citation": "@misc{koupaee2018wikihow,\n title={WikiHow: A Large Scale Text Summarization Dataset},\n author={Mahnaz Koupaee and William Yang Wang},\n year={2018},\n eprint={1810.09305},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}",
3
"description": "WikiHow is a new large-scale dataset using the online WikiHow\n(http://www.wikihow.com/) knowledge base.\n\nThere are two features:\n - text: wikihow answers texts.\n - headline: bold lines as summary.\n\nThere are two separate versions:\n - all: consisting of the concatenation of all paragraphs as the articles and\n the bold lines as the reference summaries.\n - sep: consisting of each paragraph and its summary.\n\nDownload \"wikihowAll.csv\" and \"wikihowSep.csv\" from\nhttps://github.com/mahnazkoupaee/WikiHow-Dataset and place them in manual folder\nhttps://www.tensorflow.org/datasets/api_docs/python/tfds/download/DownloadConfig.\nTrain/validation/test splits are provided by the authors.\nPreprocessing is applied to remove short articles\n(abstract length < 0.75 article length) and clean up extra commas.",
4
"downloadSize": "5460385",
5
"location": {
6
"urls": [
7
"https://github.com/mahnazkoupaee/WikiHow-Dataset"
8
]
9
},
10
"name": "wikihow",
11
"schema": {
12
"feature": [
13
{
14
"name": "headline",
15
"type": "BYTES"
16
},
17
{
18
"name": "text",
19
"type": "BYTES"
20
},
21
{
22
"name": "title",
23
"type": "BYTES"
24
}
25
]
26
},
27
"sizeInBytes": "5460385",
28
"splits": [
29
{
30
"name": "test",
31
"numShards": "1",
32
"shardLengths": [
33
"5577"
34
],
35
"statistics": {
36
"features": [
37
{
38
"bytesStats": {
39
"commonStats": {
40
"numNonMissing": "5577"
41
}
42
},
43
"name": "headline",
44
"type": "BYTES"
45
},
46
{
47
"bytesStats": {
48
"commonStats": {
49
"numNonMissing": "5577"
50
}
51
},
52
"name": "text",
53
"type": "BYTES"
54
},
55
{
56
"bytesStats": {
57
"commonStats": {
58
"numNonMissing": "5577"
59
}
60
},
61
"name": "title",
62
"type": "BYTES"
63
}
64
],
65
"numExamples": "5577"
66
}
67
},
68
{
69
"name": "train",
70
"numShards": "1",
71
"shardLengths": [
72
"39313",
73
"39313",
74
"39313",
75
"39313"
76
],
77
"statistics": {
78
"features": [
79
{
80
"bytesStats": {
81
"commonStats": {
82
"numNonMissing": "157252"
83
}
84
},
85
"name": "headline",
86
"type": "BYTES"
87
},
88
{
89
"bytesStats": {
90
"commonStats": {
91
"numNonMissing": "157252"
92
}
93
},
94
"name": "text",
95
"type": "BYTES"
96
},
97
{
98
"bytesStats": {
99
"commonStats": {
100
"numNonMissing": "157252"
101
}
102
},
103
"name": "title",
104
"type": "BYTES"
105
}
106
],
107
"numExamples": "157252"
108
}
109
},
110
{
111
"name": "validation",
112
"numShards": "1",
113
"shardLengths": [
114
"5599"
115
],
116
"statistics": {
117
"features": [
118
{
119
"bytesStats": {
120
"commonStats": {
121
"numNonMissing": "5599"
122
}
123
},
124
"name": "headline",
125
"type": "BYTES"
126
},
127
{
128
"bytesStats": {
129
"commonStats": {
130
"numNonMissing": "5599"
131
}
132
},
133
"name": "text",
134
"type": "BYTES"
135
},
136
{
137
"bytesStats": {
138
"commonStats": {
139
"numNonMissing": "5599"
140
}
141
},
142
"name": "title",
143
"type": "BYTES"
144
}
145
],
146
"numExamples": "5599"
147
}
148
}
149
],
150
"supervisedKeys": {
151
"input": "text",
152
"output": "headline"
153
},
154
"version": "1.2.0"
155
}
156