CoCalc -- dataset

GitHub Repository: amanchadha/coursera-natural-language-processing-specialization
Path: blob/master/4 - Natural Language Processing with Attention Models/Week 2/data/wikihow/all/1.2.0/dataset_info.json
⁶⁵ views
1
{
2
  "citation": "@misc{koupaee2018wikihow,\n    title={WikiHow: A Large Scale Text Summarization Dataset},\n    author={Mahnaz Koupaee and William Yang Wang},\n    year={2018},\n    eprint={1810.09305},\n    archivePrefix={arXiv},\n    primaryClass={cs.CL}\n}",
3
  "description": "WikiHow is a new large-scale dataset using the online WikiHow\n(http://www.wikihow.com/) knowledge base.\n\nThere are two features:\n  - text: wikihow answers texts.\n  - headline: bold lines as summary.\n\nThere are two separate versions:\n  - all: consisting of the concatenation of all paragraphs as the articles and\n         the bold lines as the reference summaries.\n  - sep: consisting of each paragraph and its summary.\n\nDownload \"wikihowAll.csv\" and \"wikihowSep.csv\" from\nhttps://github.com/mahnazkoupaee/WikiHow-Dataset and place them in manual folder\nhttps://www.tensorflow.org/datasets/api_docs/python/tfds/download/DownloadConfig.\nTrain/validation/test splits are provided by the authors.\nPreprocessing is applied to remove short articles\n(abstract length < 0.75 article length) and clean up extra commas.",
4
  "downloadSize": "5460385",
5
  "location": {
6
    "urls": [
7
      "https://github.com/mahnazkoupaee/WikiHow-Dataset"
8
    ]
9
  },
10
  "name": "wikihow",
11
  "schema": {
12
    "feature": [
13
      {
14
        "name": "headline",
15
        "type": "BYTES"
16
      },
17
      {
18
        "name": "text",
19
        "type": "BYTES"
20
      },
21
      {
22
        "name": "title",
23
        "type": "BYTES"
24
      }
25
    ]
26
  },
27
  "sizeInBytes": "5460385",
28
  "splits": [
29
    {
30
      "name": "test",
31
      "numShards": "1",
32
      "shardLengths": [
33
        "5577"
34
      ],
35
      "statistics": {
36
        "features": [
37
          {
38
            "bytesStats": {
39
              "commonStats": {
40
                "numNonMissing": "5577"
41
              }
42
            },
43
            "name": "headline",
44
            "type": "BYTES"
45
          },
46
          {
47
            "bytesStats": {
48
              "commonStats": {
49
                "numNonMissing": "5577"
50
              }
51
            },
52
            "name": "text",
53
            "type": "BYTES"
54
          },
55
          {
56
            "bytesStats": {
57
              "commonStats": {
58
                "numNonMissing": "5577"
59
              }
60
            },
61
            "name": "title",
62
            "type": "BYTES"
63
          }
64
        ],
65
        "numExamples": "5577"
66
      }
67
    },
68
    {
69
      "name": "train",
70
      "numShards": "1",
71
      "shardLengths": [
72
        "39313",
73
        "39313",
74
        "39313",
75
        "39313"
76
      ],
77
      "statistics": {
78
        "features": [
79
          {
80
            "bytesStats": {
81
              "commonStats": {
82
                "numNonMissing": "157252"
83
              }
84
            },
85
            "name": "headline",
86
            "type": "BYTES"
87
          },
88
          {
89
            "bytesStats": {
90
              "commonStats": {
91
                "numNonMissing": "157252"
92
              }
93
            },
94
            "name": "text",
95
            "type": "BYTES"
96
          },
97
          {
98
            "bytesStats": {
99
              "commonStats": {
100
                "numNonMissing": "157252"
101
              }
102
            },
103
            "name": "title",
104
            "type": "BYTES"
105
          }
106
        ],
107
        "numExamples": "157252"
108
      }
109
    },
110
    {
111
      "name": "validation",
112
      "numShards": "1",
113
      "shardLengths": [
114
        "5599"
115
      ],
116
      "statistics": {
117
        "features": [
118
          {
119
            "bytesStats": {
120
              "commonStats": {
121
                "numNonMissing": "5599"
122
              }
123
            },
124
            "name": "headline",
125
            "type": "BYTES"
126
          },
127
          {
128
            "bytesStats": {
129
              "commonStats": {
130
                "numNonMissing": "5599"
131
              }
132
            },
133
            "name": "text",
134
            "type": "BYTES"
135
          },
136
          {
137
            "bytesStats": {
138
              "commonStats": {
139
                "numNonMissing": "5599"
140
              }
141
            },
142
            "name": "title",
143
            "type": "BYTES"
144
          }
145
        ],
146
        "numExamples": "5599"
147
      }
148
    }
149
  ],
150
  "supervisedKeys": {
151
    "input": "text",
152
    "output": "headline"
153
  },
154
  "version": "1.2.0"
155
}
156
Product

Resources

Company