CoCalc -- update_authors.pl

GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/contrib/openzfs/scripts/update_authors.pl
¹⁰⁴⁸⁷⁴ views
1
#!/usr/bin/env perl
2

3
# SPDX-License-Identifier: MIT
4
#
5
# Copyright (c) 2023, Rob Norris <[email protected]>
6
#
7
# Permission is hereby granted, free of charge, to any person obtaining a copy
8
# of this software and associated documentation files (the "Software"), to
9
# deal in the Software without restriction, including without limitation the
10
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
11
# sell copies of the Software, and to permit persons to whom the Software is
12
# furnished to do so, subject to the following conditions:
13
#
14
# The above copyright notice and this permission notice shall be included in
15
# all copies or substantial portions of the Software.
16
#
17
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23
# IN THE SOFTWARE.
24

25

26
# This program will update the AUTHORS file to include commit authors that are
27
# in the git history but are not yet credited.
28
#
29
# The CONTRIBUTORS section of the AUTHORS file attempts to be a list of
30
# individual contributors to OpenZFS, with one name, address and line per
31
# person. This is good for readability, but does not really allow for the fact
32
# that names and emails on commits from the same individual can be different,
33
# for all kinds of reasons, not limited to:
34
#
35
# - a person might change organisations, and so their email address changes
36
#
37
# - a person might be paid to work on OpenZFS for their employer, and then hack
38
#   on personal projects in the evening, so commits legitimately come from
39
#   different addresses
40
#
41
# - names change for all kinds of reasons
42
#
43
# To try and account for this, this program will try to find all the possible
44
# names and emails for a single contributor, and then select the "best" one to
45
# add to the AUTHORS file.
46
#
47
# The CONTRIBUTORS section of the AUTHORS file is considered the source of
48
# truth. Once an individual committer is listed in there, that line will not be
49
# removed regardless of what is discovered in the commit history. However, it
50
# can't just be _anything_. The name or email still has to match something seen
51
# in the commit history, so that we're able to understand that it's the same
52
# contributor.
53
#
54
# The bulk of the work is in running `git log` to fetch commit author names and
55
# emails. For each value, we generate a "slug" to use as an internal id for
56
# that value, which is mostly just the lowercase of the value with whitespace
57
# and punctuation removed. Two values with subtle differences can produce the
58
# same slug, so at this point we also try to keep the "best" pre-slug value as
59
# the display version. We use this slug to update two maps, one of email->name,
60
# the other of name->email.
61
#
62
# Where possible, we also consider Signed-off-by: trailers in the commit
63
# message, and if they match the commit author, enter them into the maps also.
64
# Because a commit can contain multiple signoffs, we only track one if either
65
# the name or the email address match the commit author (by slug). This is
66
# mostly aimed at letting an explicit signoff override a generated name or
67
# email on the same commit (usually a Github noreply), while avoiding every
68
# signoff ever being treated as a possible canonical ident for some other
69
# committer. (Also note that this behaviour only works for signoffs that can be
70
# extracted with git-interpret-trailers, which misses many seen in the OpenZFS
71
# git history, for various reasons).
72
#
73
# Once collected, we then walk all the emails we've seen and get all the names
74
# associated with every instance. Then for each of those names, we get all the
75
# emails associated, and so on until we've seen all the connected names and
76
# emails. This collection is every possible name and email for an individual
77
# contributor.
78
#
79
# Finally, we consider these groups, and select the "best" name and email for
80
# the contributor, and add them to the author tables if they aren't there
81
# already. Once we've done everyone, we write out a new AUTHORS file, and
82
# that's the whole job.
83
#
84
# This is imperfect! It's necessary for the user to examine the diff and make
85
# sure it's sensible. If it hasn't hooked up right, it may be necessary to adjust
86
# the input data (via .mailmap) or improve the heuristics in this program. It
87
# took a long time to get into good shape when first written (355 new names
88
# added to AUTHORS!) but hopefully in the future we'll be running this
89
# regularly so it doesn't fall so far behind.
90

91

92
use 5.010;
93
use warnings;
94
use strict;
95

96
# Storage for the "best looking" version of name or email, keyed on slug.
my %display_name;
my %display_email;

# First, we load the existing AUTHORS file. We save everything up to and
# including the CONTRIBUTORS: line as-is so we can write it back out to the
# new file. Then we extract name,email pairs from the remainder and store them
# in a pair of hashtables, keyed on slug:
#
#   %authors_name   email slug -> name slug
#   %authors_email  name slug  -> email slug
my %authors_name;
my %authors_email;

my @authors_header;

{
	# Open the file explicitly so that a missing or unreadable AUTHORS
	# (for example, when run from the wrong directory) stops the program
	# here, rather than carrying on with nothing loaded and later writing
	# out a new AUTHORS that has lost its header.
	open my $authors_fh, '<', 'AUTHORS'
	    or die "E: couldn't open AUTHORS for read: $!\n";

	my $in_header = 1;
	while (my $line = <$authors_fh>) {
		chomp $line;

		if ($in_header) {
			# The CONTRIBUTORS: line itself is the last line of
			# the header.
			push @authors_header, $line;
			$in_header = 0 if $line =~ m/^CONTRIBUTORS:/;
			next;
		}

		# Contributor lines are indented "Name <email>". Anything
		# that doesn't look like that is skipped.
		my ($name, $email) = $line =~ m/^\s+(.+)(?= <) <([^>]+)/;
		next unless $name;

		my $semail = email_slug($email);
		my $sname = name_slug($name);

		$authors_name{$semail} = $sname;
		$authors_email{$sname} = $semail;

		# The name/email in AUTHORS is already the "best looking"
		# version, by definition.
		$display_name{$sname} = $name;
		$display_email{$semail} = $email;
	}

	close $authors_fh;
}
131

132
# Next, we load all the commit authors and signoff pairs, and form name<->email
# mappings, keyed on slug. Note that this format is getting the
# .mailmap-converted form. This lets us control the input to some extent by
# making changes there.
#
#   %seen_names   email slug -> { name slug  => 1, ... }
#   %seen_emails  name slug  -> { email slug => 1, ... }
my %seen_names;
my %seen_emails;

# The true email address from commits, by slug. We do this so we can generate
# mailmap entries, which will only match the exact address from the commit,
# not anything "prettified". This lets us remember the prefix part of Github
# noreply addresses, while not including it in AUTHORS if that is truly the
# best option we have.
my %commit_email;

# Capture the whole log first so that git's exit status can be checked. If git
# fails (not installed, not inside a repository, and so on) we stop here
# instead of treating the failure as "no commits" and carrying on.
my @log_lines = qx(git log --pretty=tformat:'%aN:::%aE:::%(trailers:key=signed-off-by,valueonly,separator=:::)');
die "E: git log failed (wait status $?)\n" if $? != 0;

# The lines are walked in the reverse of the order git printed them.
for my $line (reverse @log_lines) {
	chomp $line;

	# Each line is "name:::email", optionally followed by one
	# ":::"-separated field per Signed-off-by: trailer.
	my ($name, $email, @signoffs) = split ':::', $line;
	next unless $name && $email;

	my $semail = email_slug($email);
	my $sname = name_slug($name);

	# Track the committer name and email.
	$seen_names{$semail}{$sname} = 1;
	$seen_emails{$sname}{$semail} = 1;

	# Keep the original commit address.
	$commit_email{$semail} = $email;

	# Consider if these are the best we've ever seen.
	update_display_name($name);
	update_display_email($email);

	# Check signoffs. Any that share exactly one of name or email with
	# the committer (by slug) are tracked as well. A signoff where both
	# match adds nothing new, and one where neither matches belongs to
	# somebody else.
	for my $signoff (@signoffs) {
		my ($soname, $soemail) = $signoff =~ m/^([^<]+)\s+<(.+)>$/;
		next unless $soname && $soemail;

		my $ssoname = name_slug($soname);
		my $ssoemail = email_slug($soemail);

		if (($semail eq $ssoemail) ^ ($sname eq $ssoname)) {
			$seen_names{$ssoemail}{$ssoname} = 1;
			$seen_emails{$ssoname}{$ssoemail} = 1;
			update_display_name($soname);
			update_display_email($soemail);
		}
	}
}
180

181
# Group everything we saw into committers. Starting from one email slug, pull
# in every name slug seen with it, then every email slug seen with those
# names, and keep alternating until neither queue has anything left. Entries
# are deleted from %seen_names and %seen_emails as they are consumed, so each
# one ends up in exactly one group.
my @committers;
for my $seed_email (sort keys %seen_names) {
	# An earlier group may already have consumed this address through a
	# cross-reference.
	next unless $seen_names{$seed_email};

	my (%group_emails, %group_names);
	my @email_queue = ($seed_email);
	my @name_queue;

	until (!@email_queue && !@name_queue) {
		while (my $email = shift @email_queue) {
			# First visit only; repeats are just counted.
			unless ($group_emails{$email}++) {
				push @name_queue,
				    sort keys %{delete $seen_names{$email}};
			}
		}
		while (my $name = shift @name_queue) {
			unless ($group_names{$name}++) {
				push @email_queue,
				    sort keys %{delete $seen_emails{$name}};
			}
		}
	}

	# A "committer" is a pair of sorted lists: all connected email slugs,
	# then all connected name slugs.
	my @emails = sort keys %group_emails;
	my @names = sort keys %group_names;
	push @committers, [\@emails, \@names];
}
210

211
# With the committers grouped, work out which of them need a new entry in
# AUTHORS.
for my $group (@committers) {
	my ($email_slugs, $name_slugs) = @{$group};

	# If this committer is already in AUTHORS, under any of their emails
	# or names, we must not touch them.
	next if grep { $authors_name{$_} } @{$email_slugs};
	next if grep { $authors_email{$_} } @{$name_slugs};

	# Decide on the "best" email and name slugs, and record the pair as a
	# new AUTHORS entry.
	my $email = best_email(@{$email_slugs});
	my $name = best_name(@{$name_slugs});

	$authors_email{$name} = $email;
	$authors_name{$email} = $name;

	# That is now the canonical identity going forward. For every other
	# address in the group that came from a commit author (signoff-only
	# addresses have no %commit_email entry), print a mailmap line for
	# the user to paste into .mailmap.
	my $cemail = $display_email{email_slug($authors_email{$name})};
	for my $alias (@{$email_slugs}) {
		next if $alias eq $email;

		my $calias = $commit_email{$alias};
		if ($calias) {
			my $cname = $display_name{$name};
			say "$cname <$cemail> <$calias>";
		}
	}
}
240

241
# Now output the new AUTHORS file: the saved header (which ends with the
# CONTRIBUTORS: line), a blank line, and then one indented "Name <email>" line
# per contributor, ordered by name slug.
open my $fh, '>', 'AUTHORS' or die "E: couldn't open AUTHORS for write: $!\n";
say {$fh} join("\n", @authors_header, "");
for my $name (sort keys %authors_email) {
	my $cname = $display_name{$name};
	my $cemail = $display_email{email_slug($authors_email{$name})};
	say {$fh} "    $cname <$cemail>";
}

# Output is buffered, so a failed write (full disk, I/O error) may only be
# reported when the handle is closed. Check it, rather than exiting
# successfully and leaving a truncated AUTHORS behind.
close $fh or die "E: couldn't write AUTHORS: $!\n";

exit 0;
251

252
# "Slugs" are used as the hashtable key for names and emails. They are used to
253
# make two variants of a value be the "same" for matching. Mostly this is
254
# to make upper and lower-case versions of a name or email compare the same,
255
# but we do a little bit of munging to handle some common cases.
256
#
257
# Note that these are only used for matching internally; for display, the
258
# slug will be used to look up the display form.
259
# Return the slug for a name: the name with all whitespace and dots removed,
# lowercased. The caller's value is not modified.
sub name_slug {
	my $slug = shift;

	# Whitespace and dots are dropped so that differences in how initials
	# are written ("J. Smith" vs "J Smith") don't matter.
	$slug =~ s/[\s\.]//g;

	return lc($slug);
}
267
# Return the slug for an email address: surrounding text removed, any Github
# noreply "userid+" prefix removed, and the result lowercased. The caller's
# value is not modified.
sub email_slug {
	my $addr = shift;

	# Strip text that is separated from the address by whitespace. The
	# first alternative is greedy, so for a single-line value that
	# contains whitespace, what survives is whatever follows the last run
	# of whitespace.
	$addr =~ s/^(.*\s+)|(\s+.*)$//g;

	# Github noreply addresses may or may not carry a leading "userid+".
	# Drop it so both forms are treated as the same address.
	if ($addr =~ m/\.noreply\.github\.com$/) {
		$addr =~ s/^[^\+]*\+//g;
	}

	return lc($addr);
}
280

281
# As we accumulate new names and addresses, record the "best looking" version
282
# of each. Once we decide to add a committer to AUTHORS, we'll take the best
283
# version of their name and address from here.
284
#
285
# Note that we don't record them if they're already in AUTHORS (that is, in
286
# %authors_name or %authors_email) because that file already contains the
287
# "best" version, by definition. So we return immediately if we've seen it
288
# there already.
289
# Offer a name as a candidate display form for its slug in %display_name.
sub update_display_name {
	my $candidate = shift;
	my $slug = name_slug($candidate);

	# Names already listed in AUTHORS keep the spelling used there.
	return if $authors_email{$slug};

	# Between two spellings with the same slug, prefer the one with fewer
	# ASCII lower-case letters and spaces; in practice, the one with more
	# capitals and punctuation. The guess is that someone who went to the
	# effort of specialising their name cares more about that form. If
	# this picks wrong, it's probably better to add a .mailmap entry.
	# The current spelling is kept unless the candidate is strictly
	# better.
	my $current = $display_name{$slug};
	if ($current) {
		my $candidate_plain = ($candidate =~ tr/a-z //);
		my $current_plain = ($current =~ tr/a-z //);
		return unless $candidate_plain < $current_plain;
	}

	$display_name{$slug} = $candidate;
	return;
}
306
# Offer an email address as a candidate display form for its slug in
# %display_email.
sub update_display_email {
	my $candidate = shift;
	my $slug = email_slug($candidate);

	# Addresses already listed in AUTHORS keep the form used there.
	return if $authors_name{$slug};

	# The leading "userid+" on Github noreply addresses is never shown.
	if ($candidate =~ m/\.noreply\.github\.com$/) {
		$candidate =~ s/^[^\+]*\+//g;
	}

	# As with names, prefer the form with fewer ASCII lower-case letters
	# and spaces. The current form is kept unless the candidate is
	# strictly better.
	my $current = $display_email{$slug};
	if ($current) {
		my $candidate_plain = ($candidate =~ tr/a-z //);
		my $current_plain = ($current =~ tr/a-z //);
		return unless $candidate_plain < $current_plain;
	}

	$display_email{$slug} = $candidate;
	return;
}
322

323
# Given a list of name slugs, return the slug whose display name sorts first
# in a plain string comparison.
#
# What counts as the "best" name is very subjective, and a simple sort gave
# good-enough results. Accented characters, punctuation and capitals probably
# indicate "better", and the most recently seen name might deserve weight in
# case the committer has changed their name or nickname. Really, though,
# .mailmap is the place to control this.
sub best_name {
	my ($winner) = sort {
		$display_name{$a} cmp $display_name{$b}
	} @_;

	return $winner;
}
343
# Given a list of email slugs, return the most preferred one (undef for an
# empty list). Preference is decided by the following rules, each one only
# consulted when all earlier rules tie:
#
#   1. an address with exactly one "@" beats one without
#   2. any address beats an internal/local one
#   3. any address beats a Github noreply alias
#   4. any address beats a freemail provider
#   5. alphabetical by domain
#   6. alphabetical by local part
sub best_email {
	state $internal_re = qr/\.(?:internal|local|\(none\))$/;
	state $noreply_re  = qr/\.noreply\.github\.com$/;
	state $freemail_re = qr/\@(?:gmail|hotmail)\.com$/;

	my @emails = sort {
		# Split into local part and domain for the alphabetical
		# tie-breaks. An address with no "@" has no domain at all;
		# treat any missing part as empty so that comparing two such
		# addresses doesn't warn about undefined values.
		my ($alocal, $adom) = split /\@/, $a;
		my ($blocal, $bdom) = split /\@/, $b;
		$_ //= '' for $alocal, $adom, $blocal, $bdom;

		# prefer address with a single @ over those without
		((($b =~ tr/@//) == 1) <=> (($a =~ tr/@//) == 1))

		# prefer any address over internal/local addresses
		|| (($a =~ $internal_re) <=> ($b =~ $internal_re))

		# prefer any address over github noreply aliases
		|| (($a =~ $noreply_re) <=> ($b =~ $noreply_re))

		# prefer any address over freemail providers
		|| (($a =~ $freemail_re) <=> ($b =~ $freemail_re))

		# alphabetical by domain
		|| ($adom cmp $bdom)

		# alphabetical by local part
		|| ($alocal cmp $blocal);
	} @_;

	return shift @emails;
}
379

380
Product

Resources

Company