Path: blob/main/sys/contrib/openzfs/scripts/update_authors.pl
48261 views
#!/usr/bin/env perl

# SPDX-License-Identifier: MIT
#
# Copyright (c) 2023, Rob Norris <[email protected]>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.


# This program will update the AUTHORS file to include commit authors that are
# in the git history but are not yet credited.
#
# The CONTRIBUTORS section of the AUTHORS file attempts to be a list of
# individual contributors to OpenZFS, with one name, address and line per
# person. This is good for readability, but does not really leave room for the
# fact that names and emails on commits from the same individual can be
# different, for all kinds of reasons, not limited to:
#
# - a person might change organisations, and so their email address changes
#
# - a person might be paid to work on OpenZFS for their employer, and then hack
#   on personal projects in the evening, so commits legitimately come from
#   different addresses
#
# - names change for all kinds of reasons
#
# To try and account for this, this program will try to find all the possible
# names and emails for a single contributor, and then select the "best" one to
# add to the AUTHORS file.
#
# The CONTRIBUTORS section of the AUTHORS file is considered the source of
# truth. Once an individual committer is listed in there, that line will not be
# removed regardless of what is discovered in the commit history. However, it
# can't just be _anything_. The name or email still has to match something seen
# in the commit history, so that we're able to understand that it's the same
# contributor.
#
# The bulk of the work is in running `git log` to fetch commit author names and
# emails. For each value, we generate a "slug" to use as an internal id for
# that value, which is mostly just the lowercase of the value with whitespace
# and punctuation removed. Two values with subtle differences can produce the
# same slug, so at this point we also try to keep the "best" pre-slug value as
# the display version. We use this slug to update two maps, one of email->name,
# the other of name->email.
#
# Where possible, we also consider Signed-off-by: trailers in the commit
# message, and if they match the commit author, enter them into the maps also.
# Because a commit can contain multiple signoffs, we only track one if either
# the name or the email address match the commit author (by slug). This is
# mostly aimed at letting an explicit signoff override a generated name or
# email on the same commit (usually a Github noreply), while avoiding every
# signoff ever being treated as a possible canonical ident for some other
# committer. (Also note that this behaviour only works for signoffs that can be
# extracted with git-interpret-trailers, which misses many seen in the OpenZFS
# git history, for various reasons).
#
# Once collected, we then walk all the emails we've seen and get all the names
# associated with every instance. Then for each of those names, we get all the
# emails associated, and so on until we've seen all the connected names and
# emails. This collection is every possible name and email for an individual
# contributor.
#
# Finally, we consider these groups, and select the "best" name and email for
# the contributor, and add them to the author tables if they aren't there
# already. Once we've done everyone, we write out a new AUTHORS file, and
# that's the whole job.
#
# This is imperfect! It's necessary for the user to examine the diff and make
# sure it's sensible. If it hasn't hooked up right, it may be necessary to
# adjust the input data (via .mailmap) or improve the heuristics in this
# program. It took a long time to get into good shape when first written (355
# new names added to AUTHORS!) but hopefully in the future we'll be running
# this regularly so it doesn't fall so far behind.


use 5.010;
use warnings;
use strict;

# Storage for the "best looking" version of name or email, keyed on slug.
my %display_name;
my %display_email;

# First, we load the existing AUTHORS file. We save everything up to and
# including the CONTRIBUTORS: line as-is so we can write it back out to the
# new file.
# Then we extract name,email pairs from the remainder of AUTHORS (the lines
# after the CONTRIBUTORS: header) and store them in a pair of hashtables,
# keyed on slug:
#
#   %authors_name   email slug -> name slug
#   %authors_email  name slug  -> email slug
#
# @authors_header holds the verbatim header lines, up to and including the
# CONTRIBUTORS: line.
my %authors_name;
my %authors_email;

my @authors_header;

{
    # Open AUTHORS explicitly and fail loudly. This file is the source of
    # truth and gets rewritten at the end of the run, so carrying on without
    # being able to read it would emit a file with no header and with every
    # existing entry recomputed from the commit history.
    open my $authors_fh, '<', 'AUTHORS'
        or die "E: couldn't open AUTHORS for read: $!\n";

    my $in_header = 1;
    while (my $line = <$authors_fh>) {
        chomp $line;

        if ($in_header) {
            # The CONTRIBUTORS: line itself is still part of the header;
            # everything after it is treated as a contributor entry.
            push @authors_header, $line;
            $in_header = 0 if $line =~ m/^CONTRIBUTORS:/;
            next;
        }

        # Contributor lines are indented and look like "Name <email>".
        # Anything that doesn't match (e.g. blank lines) is skipped.
        my ($name, $email) = $line =~ m/^\s+(.+)(?= <) <([^>]+)/;
        next unless $name;

        my $semail = email_slug($email);
        my $sname = name_slug($name);

        $authors_name{$semail} = $sname;
        $authors_email{$sname} = $semail;

        # The name/email in AUTHORS is already the "best looking"
        # version, by definition.
        $display_name{$sname} = $name;
        $display_email{$semail} = $email;
    }

    close $authors_fh;
}

# Next, we load all the commit authors and signoff pairs, and form name<->email
# mappings, keyed on slug. Note that this format is getting the
# .mailmap-converted form. This lets us control the input to some extent by
# making changes there.
my %seen_names;
my %seen_emails;

# The true email address from commits, by slug. We do this so we can generate
# mailmap entries, which will only match the exact address from the commit,
# not anything "prettified".
# %commit_email maps an email slug to the exact address seen on a commit.
# Keeping it lets us remember the prefix part of Github noreply addresses,
# while not including it in AUTHORS if that is truly the best option we have.
my %commit_email;

# Each output line is "author name:::author email:::signoff:::signoff...".
# %aN/%aE are the .mailmap-converted author name and email.
my @log_lines = qx(git log --pretty=tformat:'%aN:::%aE:::%(trailers:key=signed-off-by,valueonly,separator=:::)');

# Don't carry on (and rewrite AUTHORS) from a failed or partial git run.
die "E: git log failed (wait status $?)\n" if $? != 0;

# git log lists newest first; reverse so we walk the lines oldest first.
for my $line (reverse @log_lines) {
    chomp $line;
    my ($name, $email, @signoffs) = split ':::', $line;
    next unless $name && $email;

    my $semail = email_slug($email);
    my $sname = name_slug($name);

    # Track the committer name and email.
    $seen_names{$semail}{$sname} = 1;
    $seen_emails{$sname}{$semail} = 1;

    # Keep the original commit address.
    $commit_email{$semail} = $email;

    # Consider if these are the best we've ever seen.
    update_display_name($name);
    update_display_email($email);

    # Check signoffs. Any that match the committer on exactly one of name
    # or email (by slug) are tracked as well. A signoff matching on both
    # adds nothing new to the maps, and one matching on neither belongs to
    # somebody else.
    for my $signoff (@signoffs) {
        my ($soname, $soemail) = $signoff =~ m/^([^<]+)\s+<(.+)>$/;
        next unless $soname && $soemail;
        my $ssoname = name_slug($soname);
        my $ssoemail = email_slug($soemail);
        if (($semail eq $ssoemail) ^ ($sname eq $ssoname)) {
            $seen_names{$ssoemail}{$ssoname} = 1;
            $seen_emails{$ssoname}{$ssoemail} = 1;
            update_display_name($soname);
            update_display_email($soemail);
        }
    }
}

# Now collect unique committers by all names+emails we've ever seen for them.
# We start with emails and resolve all possible names, then we resolve the
# emails for those names, and round and round until there's nothing left.
my @committers;
for my $start_email (sort keys %seen_names) {
    # it might have been deleted already through a cross-reference
    next unless $seen_names{$start_email};

    my %emails;
    my %names;

    my @check_emails = ($start_email);
    my @check_names;
    while (@check_emails || @check_names) {
        # Test for defined rather than truth, so that a slug that happens
        # to be "0" or "" is still processed instead of silently dropped.
        while (defined(my $email = shift @check_emails)) {
            next if $emails{$email}++;
            push @check_names,
                sort keys %{delete $seen_names{$email}};
        }
        while (defined(my $name = shift @check_names)) {
            next if $names{$name}++;
            push @check_emails,
                sort keys %{delete $seen_emails{$name}};
        }
    }

    # A "committer" is the collection of connected names and emails.
    push @committers, [[sort keys %emails], [sort keys %names]];
}

# Now we have our committers, we can work out what to add to AUTHORS.
for my $committer (@committers) {
    my ($emails, $names) = @$committer;

    # If this committer is already in AUTHORS, we must not touch.
    next if grep { $authors_name{$_} } @$emails;
    next if grep { $authors_email{$_} } @$names;

    # Decide on the "best" name and email to use
    my $email = best_email(@$emails);
    my $name = best_name(@$names);

    $authors_email{$name} = $email;
    $authors_name{$email} = $name;

    # We've now selected our canonical name going forward. If there
    # were other options from commit authors only (not signoffs),
    # emit mailmap lines for the user to paste into .mailmap
    my $cname = $display_name{$name};
    my $cemail = $display_email{email_slug($authors_email{$name})};
    for my $alias (@$emails) {
        next if $alias eq $email;

        # Only addresses that appeared as a commit author have an entry
        # here; signoff-only addresses are skipped.
        my $calias = $commit_email{$alias};
        next unless $calias;

        say "$cname <$cemail> <$calias>";
    }
}

# Now output the new AUTHORS file
open my $fh, '>', 'AUTHORS' or die "E: couldn't open AUTHORS for write: $!\n";
say $fh join("\n", @authors_header, "");
for my $name (sort keys %authors_email) {
    my $cname = $display_name{$name};
    my $cemail = $display_email{email_slug($authors_email{$name})};
    # NOTE(review): the leading indent in this literal is reproduced as it
    # appears in the reviewed text, where runs of whitespace may have been
    # collapsed - confirm the width against the existing AUTHORS entries.
    say $fh " $cname <$cemail>";
}

# Buffered write errors only surface at close, so check it.
close $fh or die "E: couldn't write AUTHORS: $!\n";

exit 0;

# "Slugs" are used as the hashtable key for names and emails. They are used to
# make two variants of a value be the "same" for matching.
# Slugs mostly exist to make upper and lower-case versions of a name or email
# compare the same, but we do a little bit of munging to handle some common
# cases.
#
# Note that these are only used for matching internally; for display, the
# slug will be used to look up the display form.

# name_slug($name): returns the matching key for a name, which is the name
# lowercased with all whitespace and dots removed.
sub name_slug {
    my ($name) = @_;

    # Remove spaces and dots, to handle differences in initials.
    $name =~ s/[\s\.]//g;

    return lc $name;
}

# email_slug($email): returns the matching key for an email address, which is
# the lowercased address with surrounding text and any Github noreply userid
# prefix removed.
sub email_slug {
    my ($email) = @_;

    # Strip anything around the address. Because the first alternative is
    # greedy, this removes everything up to and including the last run of
    # whitespace, so what remains is the final whitespace-free token (or
    # nothing at all, if the value ends in whitespace).
    $email =~ s/^(.*\s+)|(\s+.*)$//g;

    # Remove the leading userid+ on Github noreply addresses. They're
    # optional and we want to treat them as the same thing.
    $email =~ s/^[^\+]*\+//g if $email =~ m/\.noreply\.github\.com$/;

    return lc $email;
}

# As we accumulate new names and addresses, record the "best looking" version
# of each. Once we decide to add a committer to AUTHORS, we'll take the best
# version of their name and address from here.
#
# Note that we don't record them if they're already in AUTHORS (that is, in
# %authors_name or %authors_email) because that file already contains the
# "best" version, by definition. So we return immediately if we've seen it
# there already.

# update_display_name($name): records $name in %display_name under its slug
# if it looks better than the version currently stored there.
sub update_display_name {
    my ($name) = @_;
    my $sname = name_slug($name);
    return if $authors_email{$sname};

    # For names, "more specific" means "has fewer lower-case ASCII letters
    # and spaces", guessing that if a person has gone to some effort to
    # specialise their name in a later commit, they presumably care more
    # about it. If this is wrong, it's probably better to add a .mailmap
    # entry.

    my $cname = $display_name{$sname};
    if (!$cname ||
        ($name =~ tr/a-z //) < ($cname =~ tr/a-z //)) {
        $display_name{$sname} = $name;
    }

    return;
}

# update_display_email($email): records $email in %display_email under its
# slug if it looks better than the version currently stored there.
sub update_display_email {
    my ($email) = @_;
    my $semail = email_slug($email);
    return if $authors_name{$semail};

    # Like names, we prefer uppercase when possible. We also remove any
    # leading "plus address" for Github noreply addresses.

    $email =~ s/^[^\+]*\+//g if $email =~ m/\.noreply\.github\.com$/;

    my $cemail = $display_email{$semail};
    if (!$cemail ||
        ($email =~ tr/a-z //) < ($cemail =~ tr/a-z //)) {
        $display_email{$semail} = $email;
    }

    return;
}

# best_name(@name_slugs): returns the slug whose display name sorts first,
# or undef when called with no slugs.
sub best_name {
    my @names = sort {
        # The "best" name is very subjective, and a simple sort
        # produced good-enough results, so I didn't try harder. Use of
        # accented characters, punctuation and caps are probably an
        # indicator of "better", but possibly we should also take into
        # account the most recent name we saw, in case the committer
        # has changed their name or nickname or similar.
        #
        # Really, .mailmap is the place to control this.

        $display_name{$a} cmp $display_name{$b};
    } @_;

    return shift @names;
}

# best_email(@email_slugs): returns the most preferred slug according to the
# ordered rules in the sort block, or undef when called with no slugs.
sub best_email {
    state $internal_re = qr/\.(?:internal|local|\(none\))$/;
    state $noreply_re = qr/\.noreply\.github\.com$/;
    state $freemail_re = qr/\@(?:gmail|hotmail)\.com$/;

    my @emails = sort {
        my $cmp;

        # prefer address with a single @ over those without
        $cmp = (($b =~ tr/@//) == 1) <=> (($a =~ tr/@//) == 1);
        return $cmp unless $cmp == 0;

        # prefer any address over internal/local addresses
        $cmp = (($a =~ $internal_re) <=> ($b =~ $internal_re));
        return $cmp unless $cmp == 0;

        # prefer any address over github noreply aliases
        $cmp = (($a =~ $noreply_re) <=> ($b =~ $noreply_re));
        return $cmp unless $cmp == 0;

        # prefer any address over freemail providers
        $cmp = (($a =~ $freemail_re) <=> ($b =~ $freemail_re));
        return $cmp unless $cmp == 0;

        # Split into local part and domain. A value without an @ has no
        # domain (and an empty value has no local part either), so default
        # both to '' rather than comparing undef.
        my ($alocal, $adom) = split /\@/, $a;
        my ($blocal, $bdom) = split /\@/, $b;
        $_ //= '' for $alocal, $adom, $blocal, $bdom;

        # alphabetical by domain
        $cmp = ($adom cmp $bdom);
        return $cmp unless $cmp == 0;

        # alphabetical by local part
        return ($alocal cmp $blocal);
    } @_;

    return shift @emails;
}