// SPDX-License-Identifier: GPL-2.01/*2* Bad block management3*4* - Heavily based on MD badblocks code from Neil Brown5*6* Copyright (c) 2015, Intel Corporation.7*/89#include <linux/badblocks.h>10#include <linux/seqlock.h>11#include <linux/device.h>12#include <linux/kernel.h>13#include <linux/module.h>14#include <linux/stddef.h>15#include <linux/types.h>16#include <linux/slab.h>1718/*19* The purpose of badblocks set/clear is to manage bad blocks ranges which are20* identified by LBA addresses.21*22* When the caller of badblocks_set() wants to set a range of bad blocks, the23* setting range can be acked or unacked. And the setting range may merge,24* overwrite, skip the overlapped already set range, depends on who they are25* overlapped or adjacent, and the acknowledgment type of the ranges. It can be26* more complicated when the setting range covers multiple already set bad block27* ranges, with restrictions of maximum length of each bad range and the bad28* table space limitation.29*30* It is difficult and unnecessary to take care of all the possible situations,31* for setting a large range of bad blocks, we can handle it by dividing the32* large range into smaller ones when encounter overlap, max range length or33* bad table full conditions. Every time only a smaller piece of the bad range34* is handled with a limited number of conditions how it is interacted with35* possible overlapped or adjacent already set bad block ranges. Then the hard36* complicated problem can be much simpler to handle in proper way.37*38* When setting a range of bad blocks to the bad table, the simplified situations39* to be considered are, (The already set bad blocks ranges are naming with40* prefix E, and the setting bad blocks range is naming with prefix S)41*42* 1) A setting range is not overlapped or adjacent to any other already set bad43* block range.44* +--------+45* | S |46* +--------+47* +-------------+ +-------------+48* | E1 | | E2 |49* +-------------+ +-------------+50* For this situation if the bad blocks table is not full, just allocate a51* free slot from the bad blocks table to mark the setting range S. The52* result is,53* +-------------+ +--------+ +-------------+54* | E1 | | S | | E2 |55* +-------------+ +--------+ +-------------+56* 2) A setting range starts exactly at a start LBA of an already set bad blocks57* range.58* 2.1) The setting range size < already set range size59* +--------+60* | S |61* +--------+62* +-------------+63* | E |64* +-------------+65* 2.1.1) If S and E are both acked or unacked range, the setting range S can66* be merged into existing bad range E. The result is,67* +-------------+68* | S |69* +-------------+70* 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and71* the result is,72* +-------------+73* | E |74* +-------------+75* 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E.76* An extra slot from the bad blocks table will be allocated for S, and head77* of E will move to end of the inserted range S. The result is,78* +--------+----+79* | S | E |80* +--------+----+81* 2.2) The setting range size == already set range size82* 2.2.1) If S and E are both acked or unacked range, the setting range S can83* be merged into existing bad range E. The result is,84* +-------------+85* | S |86* +-------------+87* 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and88* the result is,89* +-------------+90* | E |91* +-------------+92* 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of93bad blocks range E. The result is,94* +-------------+95* | S |96* +-------------+97* 2.3) The setting range size > already set range size98* +-------------------+99* | S |100* +-------------------+101* +-------------+102* | E |103* +-------------+104* For such situation, the setting range S can be treated as two parts, the105* first part (S1) is as same size as the already set range E, the second106* part (S2) is the rest of setting range.107* +-------------+-----+ +-------------+ +-----+108* | S1 | S2 | | S1 | | S2 |109* +-------------+-----+ ===> +-------------+ +-----+110* +-------------+ +-------------+111* | E | | E |112* +-------------+ +-------------+113* Now we only focus on how to handle the setting range S1 and already set114* range E, which are already explained in 2.2), for the rest S2 it will be115* handled later in next loop.116* 3) A setting range starts before the start LBA of an already set bad blocks117* range.118* +-------------+119* | S |120* +-------------+121* +-------------+122* | E |123* +-------------+124* For this situation, the setting range S can be divided into two parts, the125* first (S1) ends at the start LBA of already set range E, the second part126* (S2) starts exactly at a start LBA of the already set range E.127* +----+---------+ +----+ +---------+128* | S1 | S2 | | S1 | | S2 |129* +----+---------+ ===> +----+ +---------+130* +-------------+ +-------------+131* | E | | E |132* +-------------+ +-------------+133* Now only the first part S1 should be handled in this loop, which is in134* similar condition as 1). The rest part S2 has exact same start LBA address135* of the already set range E, they will be handled in next loop in one of136* situations in 2).137* 4) A setting range starts after the start LBA of an already set bad blocks138* range.139* 4.1) If the setting range S exactly matches the tail part of already set bad140* blocks range E, like the following chart shows,141* +---------+142* | S |143* +---------+144* +-------------+145* | E |146* +-------------+147* 4.1.1) If range S and E have same acknowledge value (both acked or unacked),148* they will be merged into one, the result is,149* +-------------+150* | S |151* +-------------+152* 4.1.2) If range E is acked and the setting range S is unacked, the setting153* request of S will be rejected, the result is,154* +-------------+155* | E |156* +-------------+157* 4.1.3) If range E is unacked, and the setting range S is acked, then S may158* overwrite the overlapped range of E, the result is,159* +---+---------+160* | E | S |161* +---+---------+162* 4.2) If the setting range S stays in middle of an already set range E, like163* the following chart shows,164* +----+165* | S |166* +----+167* +--------------+168* | E |169* +--------------+170* 4.2.1) If range S and E have same acknowledge value (both acked or unacked),171* they will be merged into one, the result is,172* +--------------+173* | S |174* +--------------+175* 4.2.2) If range E is acked and the setting range S is unacked, the setting176* request of S will be rejected, the result is also,177* +--------------+178* | E |179* +--------------+180* 4.2.3) If range E is unacked, and the setting range S is acked, then S will181* inserted into middle of E and split previous range E into two parts (E1182* and E2), the result is,183* +----+----+----+184* | E1 | S | E2 |185* +----+----+----+186* 4.3) If the setting bad blocks range S is overlapped with an already set bad187* blocks range E. The range S starts after the start LBA of range E, and188* ends after the end LBA of range E, as the following chart shows,189* +-------------------+190* | S |191* +-------------------+192* +-------------+193* | E |194* +-------------+195* For this situation the range S can be divided into two parts, the first196* part (S1) ends at end range E, and the second part (S2) has rest range of197* origin S.198* +---------+---------+ +---------+ +---------+199* | S1 | S2 | | S1 | | S2 |200* +---------+---------+ ===> +---------+ +---------+201* +-------------+ +-------------+202* | E | | E |203* +-------------+ +-------------+204* Now in this loop the setting range S1 and already set range E can be205* handled as the situations 4.1), the rest range S2 will be handled in next206* loop and ignored in this loop.207* 5) A setting bad blocks range S is adjacent to one or more already set bad208* blocks range(s), and they are all acked or unacked range.209* 5.1) Front merge: If the already set bad blocks range E is before setting210* range S and they are adjacent,211* +------+212* | S |213* +------+214* +-------+215* | E |216* +-------+217* 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge218* values are same, the setting range S can front merges into range E. The219* result is,220* +--------------+221* | S |222* +--------------+223* 5.1.2) Otherwise these two ranges cannot merge, just insert the setting224* range S right after already set range E into the bad blocks table. The225* result is,226* +--------+------+227* | E | S |228* +--------+------+229* 6) Special cases which above conditions cannot handle230* 6.1) Multiple already set ranges may merge into less ones in a full bad table231* +-------------------------------------------------------+232* | S |233* +-------------------------------------------------------+234* |<----- BB_MAX_LEN ----->|235* +-----+ +-----+ +-----+236* | E1 | | E2 | | E3 |237* +-----+ +-----+ +-----+238* In the above example, when the bad blocks table is full, inserting the239* first part of setting range S will fail because no more available slot240* can be allocated from bad blocks table. In this situation a proper241* setting method should be go though all the setting bad blocks range and242* look for chance to merge already set ranges into less ones. When there243* is available slot from bad blocks table, re-try again to handle more244* setting bad blocks ranges as many as possible.245* +------------------------+246* | S3 |247* +------------------------+248* |<----- BB_MAX_LEN ----->|249* +-----+-----+-----+---+-----+--+250* | S1 | S2 |251* +-----+-----+-----+---+-----+--+252* The above chart shows although the first part (S3) cannot be inserted due253* to no-space in bad blocks table, but the following E1, E2 and E3 ranges254* can be merged with rest part of S into less range S1 and S2. Now there is255* 1 free slot in bad blocks table.256* +------------------------+-----+-----+-----+---+-----+--+257* | S3 | S1 | S2 |258* +------------------------+-----+-----+-----+---+-----+--+259* Since the bad blocks table is not full anymore, re-try again for the260* origin setting range S. Now the setting range S3 can be inserted into the261* bad blocks table with previous freed slot from multiple ranges merge.262* 6.2) Front merge after overwrite263* In the following example, in bad blocks table, E1 is an acked bad blocks264* range and E2 is an unacked bad blocks range, therefore they are not able265* to merge into a larger range. The setting bad blocks range S is acked,266* therefore part of E2 can be overwritten by S.267* +--------+268* | S | acknowledged269* +--------+ S: 1270* +-------+-------------+ E1: 1271* | E1 | E2 | E2: 0272* +-------+-------------+273* With previous simplified routines, after overwriting part of E2 with S,274* the bad blocks table should be (E3 is remaining part of E2 which is not275* overwritten by S),276* acknowledged277* +-------+--------+----+ S: 1278* | E1 | S | E3 | E1: 1279* +-------+--------+----+ E3: 0280* The above result is correct but not perfect. Range E1 and S in the bad281* blocks table are all acked, merging them into a larger one range may282* occupy less bad blocks table space and make badblocks_check() faster.283* Therefore in such situation, after overwriting range S, the previous range284* E1 should be checked for possible front combination. Then the ideal285* result can be,286* +----------------+----+ acknowledged287* | E1 | E3 | E1: 1288* +----------------+----+ E3: 0289* 6.3) Behind merge: If the already set bad blocks range E is behind the setting290* range S and they are adjacent. Normally we don't need to care about this291* because front merge handles this while going though range S from head to292* tail, except for the tail part of range S. When the setting range S are293* fully handled, all the above simplified routine doesn't check whether the294* tail LBA of range S is adjacent to the next already set range and not295* merge them even it is possible.296* +------+297* | S |298* +------+299* +-------+300* | E |301* +-------+302* For the above special situation, when the setting range S are all handled303* and the loop ends, an extra check is necessary for whether next already304* set range E is right after S and mergeable.305* 6.3.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge306* values are same, the setting range S can behind merges into range E. The307* result is,308* +--------------+309* | S |310* +--------------+311* 6.3.2) Otherwise these two ranges cannot merge, just insert the setting range312* S in front of the already set range E in the bad blocks table. The result313* is,314* +------+-------+315* | S | E |316* +------+-------+317*318* All the above 5 simplified situations and 3 special cases may cover 99%+ of319* the bad block range setting conditions. Maybe there is some rare corner case320* is not considered and optimized, it won't hurt if badblocks_set() fails due321* to no space, or some ranges are not merged to save bad blocks table space.322*323* Inside badblocks_set() each loop starts by jumping to re_insert label, every324* time for the new loop prev_badblocks() is called to find an already set range325* which starts before or at current setting range. Since the setting bad blocks326* range is handled from head to tail, most of the cases it is unnecessary to do327* the binary search inside prev_badblocks(), it is possible to provide a hint328* to prev_badblocks() for a fast path, then the expensive binary search can be329* avoided. In my test with the hint to prev_badblocks(), except for the first330* loop, all rested calls to prev_badblocks() can go into the fast path and331* return correct bad blocks table index immediately.332*333*334* Clearing a bad blocks range from the bad block table has similar idea as335* setting does, but much more simpler. The only thing needs to be noticed is336* when the clearing range hits middle of a bad block range, the existing bad337* block range will split into two, and one more item should be added into the338* bad block table. The simplified situations to be considered are, (The already339* set bad blocks ranges in bad block table are naming with prefix E, and the340* clearing bad blocks range is naming with prefix C)341*342* 1) A clearing range is not overlapped to any already set ranges in bad block343* table.344* +-----+ | +-----+ | +-----+345* | C | | | C | | | C |346* +-----+ or +-----+ or +-----+347* +---+ | +----+ +----+ | +---+348* | E | | | E1 | | E2 | | | E |349* +---+ | +----+ +----+ | +---+350* For the above situations, no bad block to be cleared and no failure351* happens, simply returns 0.352* 2) The clearing range hits middle of an already setting bad blocks range in353* the bad block table.354* +---+355* | C |356* +---+357* +-----------------+358* | E |359* +-----------------+360* In this situation if the bad block table is not full, the range E will be361* split into two ranges E1 and E2. The result is,362* +------+ +------+363* | E1 | | E2 |364* +------+ +------+365* 3) The clearing range starts exactly at same LBA as an already set bad block range366* from the bad block table.367* 3.1) Partially covered at head part368* +------------+369* | C |370* +------------+371* +-----------------+372* | E |373* +-----------------+374* For this situation, the overlapped already set range will update the375* start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). No376* item deleted from bad block table. The result is,377* +----+378* | E1 |379* +----+380* 3.2) Exact fully covered381* +-----------------+382* | C |383* +-----------------+384* +-----------------+385* | E |386* +-----------------+387* For this situation the whole bad blocks range E will be cleared and its388* corresponded item is deleted from the bad block table.389* 4) The clearing range exactly ends at same LBA as an already set bad block390* range.391* +-------+392* | C |393* +-------+394* +-----------------+395* | E |396* +-----------------+397* For the above situation, the already set range E is updated to shrink its398* end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C).399* The result is,400* +---------+401* | E |402* +---------+403* 5) The clearing range is partially overlapped with an already set bad block404* range from the bad block table.405* 5.1) The already set bad block range is front overlapped with the clearing406* range.407* +----------+408* | C |409* +----------+410* +------------+411* | E |412* +------------+413* For such situation, the clearing range C can be treated as two parts. The414* first part ends at the start LBA of range E, and the second part starts at415* same LBA of range E.416* +----+-----+ +----+ +-----+417* | C1 | C2 | | C1 | | C2 |418* +----+-----+ ===> +----+ +-----+419* +------------+ +------------+420* | E | | E |421* +------------+ +------------+422* Now the first part C1 can be handled as condition 1), and the second part C2 can be423* handled as condition 3.1) in next loop.424* 5.2) The already set bad block range is behind overlaopped with the clearing425* range.426* +----------+427* | C |428* +----------+429* +------------+430* | E |431* +------------+432* For such situation, the clearing range C can be treated as two parts. The433* first part C1 ends at same end LBA of range E, and the second part starts434* at end LBA of range E.435* +----+-----+ +----+ +-----+436* | C1 | C2 | | C1 | | C2 |437* +----+-----+ ===> +----+ +-----+438* +------------+ +------------+439* | E | | E |440* +------------+ +------------+441* Now the first part clearing range C1 can be handled as condition 4), and442* the second part clearing range C2 can be handled as condition 1) in next443* loop.444*445* All bad blocks range clearing can be simplified into the above 5 situations446* by only handling the head part of the clearing range in each run of the447* while-loop. The idea is similar to bad blocks range setting but much448* simpler.449*/450451/*452* Find the range starts at-or-before 's' from bad table. The search453* starts from index 'hint' and stops at index 'hint_end' from the bad454* table.455*/456static int prev_by_hint(struct badblocks *bb, sector_t s, int hint)457{458int hint_end = hint + 2;459u64 *p = bb->page;460int ret = -1;461462while ((hint < hint_end) && ((hint + 1) <= bb->count) &&463(BB_OFFSET(p[hint]) <= s)) {464if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) {465ret = hint;466break;467}468hint++;469}470471return ret;472}473474/*475* Find the range starts at-or-before bad->start. If 'hint' is provided476* (hint >= 0) then search in the bad table from hint firstly. It is477* very probably the wanted bad range can be found from the hint index,478* then the unnecessary while-loop iteration can be avoided.479*/480static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad,481int hint)482{483sector_t s = bad->start;484int ret = -1;485int lo, hi;486u64 *p;487488if (!bb->count)489goto out;490491if (hint >= 0) {492ret = prev_by_hint(bb, s, hint);493if (ret >= 0)494goto out;495}496497lo = 0;498hi = bb->count;499p = bb->page;500501/* The following bisect search might be unnecessary */502if (BB_OFFSET(p[lo]) > s)503return -1;504if (BB_OFFSET(p[hi - 1]) <= s)505return hi - 1;506507/* Do bisect search in bad table */508while (hi - lo > 1) {509int mid = (lo + hi)/2;510sector_t a = BB_OFFSET(p[mid]);511512if (a == s) {513ret = mid;514goto out;515}516517if (a < s)518lo = mid;519else520hi = mid;521}522523if (BB_OFFSET(p[lo]) <= s)524ret = lo;525out:526return ret;527}528529/*530* Return 'true' if the range indicated by 'bad' can be forward531* merged with the bad range (from the bad table) indexed by 'prev'.532*/533static bool can_merge_front(struct badblocks *bb, int prev,534struct badblocks_context *bad)535{536sector_t s = bad->start;537u64 *p = bb->page;538539if (BB_ACK(p[prev]) == bad->ack &&540(s < BB_END(p[prev]) ||541(s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN))))542return true;543return false;544}545546/*547* Do forward merge for range indicated by 'bad' and the bad range548* (from bad table) indexed by 'prev'. The return value is sectors549* merged from bad->len.550*/551static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad)552{553sector_t sectors = bad->len;554sector_t s = bad->start;555u64 *p = bb->page;556int merged = 0;557558WARN_ON(s > BB_END(p[prev]));559560if (s < BB_END(p[prev])) {561merged = min_t(sector_t, sectors, BB_END(p[prev]) - s);562} else {563merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev]));564if ((prev + 1) < bb->count &&565merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) {566merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]);567}568569p[prev] = BB_MAKE(BB_OFFSET(p[prev]),570BB_LEN(p[prev]) + merged, bad->ack);571}572573return merged;574}575576/*577* 'Combine' is a special case which can_merge_front() is not able to578* handle: If a bad range (indexed by 'prev' from bad table) exactly579* starts as bad->start, and the bad range ahead of 'prev' (indexed by580* 'prev - 1' from bad table) exactly ends at where 'prev' starts, and581* the sum of their lengths does not exceed BB_MAX_LEN limitation, then582* these two bad range (from bad table) can be combined.583*584* Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad585* table can be combined.586*/587static bool can_combine_front(struct badblocks *bb, int prev,588struct badblocks_context *bad)589{590u64 *p = bb->page;591592if ((prev > 0) &&593(BB_OFFSET(p[prev]) == bad->start) &&594(BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) &&595(BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) &&596(BB_ACK(p[prev - 1]) == BB_ACK(p[prev])))597return true;598return false;599}600601/*602* Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad603* table) into one larger bad range, and the new range is indexed by604* 'prev - 1'.605* The caller of front_combine() will decrease bb->count, therefore606* it is unnecessary to clear p[perv] after front merge.607*/608static void front_combine(struct badblocks *bb, int prev)609{610u64 *p = bb->page;611612p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]),613BB_LEN(p[prev - 1]) + BB_LEN(p[prev]),614BB_ACK(p[prev]));615if ((prev + 1) < bb->count)616memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8);617}618619/*620* Return 'true' if the range indicated by 'bad' is exactly forward621* overlapped with the bad range (from bad table) indexed by 'front'.622* Exactly forward overlap means the bad range (from bad table) indexed623* by 'prev' does not cover the whole range indicated by 'bad'.624*/625static bool overlap_front(struct badblocks *bb, int front,626struct badblocks_context *bad)627{628u64 *p = bb->page;629630if (bad->start >= BB_OFFSET(p[front]) &&631bad->start < BB_END(p[front]))632return true;633return false;634}635636/*637* Return 'true' if the range indicated by 'bad' is exactly backward638* overlapped with the bad range (from bad table) indexed by 'behind'.639*/640static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad,641int behind)642{643u64 *p = bb->page;644645if (bad->start < BB_OFFSET(p[behind]) &&646(bad->start + bad->len) > BB_OFFSET(p[behind]))647return true;648return false;649}650651/*652* Return 'true' if the range indicated by 'bad' can overwrite the bad653* range (from bad table) indexed by 'prev'.654*655* The range indicated by 'bad' can overwrite the bad range indexed by656* 'prev' when,657* 1) The whole range indicated by 'bad' can cover partial or whole bad658* range (from bad table) indexed by 'prev'.659* 2) The ack value of 'bad' is larger or equal to the ack value of bad660* range 'prev'.661*662* If the overwriting doesn't cover the whole bad range (from bad table)663* indexed by 'prev', new range might be split from existing bad range,664* 1) The overwrite covers head or tail part of existing bad range, 1665* extra bad range will be split and added into the bad table.666* 2) The overwrite covers middle of existing bad range, 2 extra bad667* ranges will be split (ahead and after the overwritten range) and668* added into the bad table.669* The number of extra split ranges of the overwriting is stored in670* 'extra' and returned for the caller.671*/672static bool can_front_overwrite(struct badblocks *bb, int prev,673struct badblocks_context *bad, int *extra)674{675u64 *p = bb->page;676int len;677678WARN_ON(!overlap_front(bb, prev, bad));679680if (BB_ACK(p[prev]) >= bad->ack)681return false;682683if (BB_END(p[prev]) <= (bad->start + bad->len)) {684len = BB_END(p[prev]) - bad->start;685if (BB_OFFSET(p[prev]) == bad->start)686*extra = 0;687else688*extra = 1;689690bad->len = len;691} else {692if (BB_OFFSET(p[prev]) == bad->start)693*extra = 1;694else695/*696* prev range will be split into two, beside the overwritten697* one, an extra slot needed from bad table.698*/699*extra = 2;700}701702if ((bb->count + (*extra)) > MAX_BADBLOCKS)703return false;704705return true;706}707708/*709* Do the overwrite from the range indicated by 'bad' to the bad range710* (from bad table) indexed by 'prev'.711* The previously called can_front_overwrite() will provide how many712* extra bad range(s) might be split and added into the bad table. All713* the splitting cases in the bad table will be handled here.714*/715static int front_overwrite(struct badblocks *bb, int prev,716struct badblocks_context *bad, int extra)717{718u64 *p = bb->page;719sector_t orig_end = BB_END(p[prev]);720int orig_ack = BB_ACK(p[prev]);721722switch (extra) {723case 0:724p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]),725bad->ack);726break;727case 1:728if (BB_OFFSET(p[prev]) == bad->start) {729p[prev] = BB_MAKE(BB_OFFSET(p[prev]),730bad->len, bad->ack);731memmove(p + prev + 2, p + prev + 1,732(bb->count - prev - 1) * 8);733p[prev + 1] = BB_MAKE(bad->start + bad->len,734orig_end - BB_END(p[prev]),735orig_ack);736} else {737p[prev] = BB_MAKE(BB_OFFSET(p[prev]),738bad->start - BB_OFFSET(p[prev]),739orig_ack);740/*741* prev +2 -> prev + 1 + 1, which is for,742* 1) prev + 1: the slot index of the previous one743* 2) + 1: one more slot for extra being 1.744*/745memmove(p + prev + 2, p + prev + 1,746(bb->count - prev - 1) * 8);747p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);748}749break;750case 2:751p[prev] = BB_MAKE(BB_OFFSET(p[prev]),752bad->start - BB_OFFSET(p[prev]),753orig_ack);754/*755* prev + 3 -> prev + 1 + 2, which is for,756* 1) prev + 1: the slot index of the previous one757* 2) + 2: two more slots for extra being 2.758*/759memmove(p + prev + 3, p + prev + 1,760(bb->count - prev - 1) * 8);761p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);762p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]),763orig_end - BB_END(p[prev + 1]),764orig_ack);765break;766default:767break;768}769770return bad->len;771}772773/*774* Explicitly insert a range indicated by 'bad' to the bad table, where775* the location is indexed by 'at'.776*/777static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad)778{779u64 *p = bb->page;780int len;781782WARN_ON(badblocks_full(bb));783784len = min_t(sector_t, bad->len, BB_MAX_LEN);785if (at < bb->count)786memmove(p + at + 1, p + at, (bb->count - at) * 8);787p[at] = BB_MAKE(bad->start, len, bad->ack);788789return len;790}791792static void badblocks_update_acked(struct badblocks *bb)793{794bool unacked = false;795u64 *p = bb->page;796int i;797798if (!bb->unacked_exist)799return;800801for (i = 0; i < bb->count ; i++) {802if (!BB_ACK(p[i])) {803unacked = true;804break;805}806}807808if (!unacked)809bb->unacked_exist = 0;810}811812/*813* Return 'true' if the range indicated by 'bad' is exactly backward814* overlapped with the bad range (from bad table) indexed by 'behind'.815*/816static bool try_adjacent_combine(struct badblocks *bb, int prev)817{818u64 *p = bb->page;819820if (prev >= 0 && (prev + 1) < bb->count &&821BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&822(BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&823BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {824p[prev] = BB_MAKE(BB_OFFSET(p[prev]),825BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),826BB_ACK(p[prev]));827828if ((prev + 2) < bb->count)829memmove(p + prev + 1, p + prev + 2,830(bb->count - (prev + 2)) * 8);831bb->count--;832return true;833}834return false;835}836837/* Do exact work to set bad block range into the bad block table */838static bool _badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors,839int acknowledged)840{841int len = 0, added = 0;842struct badblocks_context bad;843int prev = -1, hint = -1;844unsigned long flags;845u64 *p;846847if (bb->shift < 0)848/* badblocks are disabled */849return false;850851if (sectors == 0)852/* Invalid sectors number */853return false;854855if (bb->shift) {856/* round the start down, and the end up */857sector_t next = s + sectors;858859rounddown(s, 1 << bb->shift);860roundup(next, 1 << bb->shift);861sectors = next - s;862}863864write_seqlock_irqsave(&bb->lock, flags);865866bad.ack = acknowledged;867p = bb->page;868869re_insert:870bad.start = s;871bad.len = sectors;872len = 0;873874if (badblocks_full(bb))875goto out;876877if (badblocks_empty(bb)) {878len = insert_at(bb, 0, &bad);879bb->count++;880added++;881goto update_sectors;882}883884prev = prev_badblocks(bb, &bad, hint);885886/* start before all badblocks */887if (prev < 0) {888/* insert on the first */889if (bad.len > (BB_OFFSET(p[0]) - bad.start))890bad.len = BB_OFFSET(p[0]) - bad.start;891len = insert_at(bb, 0, &bad);892bb->count++;893added++;894hint = ++prev;895goto update_sectors;896}897898/* in case p[prev-1] can be merged with p[prev] */899if (can_combine_front(bb, prev, &bad)) {900front_combine(bb, prev);901bb->count--;902added++;903hint = prev;904goto update_sectors;905}906907if (can_merge_front(bb, prev, &bad)) {908len = front_merge(bb, prev, &bad);909added++;910hint = prev;911goto update_sectors;912}913914if (overlap_front(bb, prev, &bad)) {915int extra = 0;916917if (!can_front_overwrite(bb, prev, &bad, &extra)) {918if (extra > 0)919goto out;920921len = min_t(sector_t,922BB_END(p[prev]) - s, sectors);923hint = prev;924goto update_sectors;925}926927len = front_overwrite(bb, prev, &bad, extra);928added++;929bb->count += extra;930931if (can_combine_front(bb, prev, &bad)) {932front_combine(bb, prev);933bb->count--;934}935936hint = prev;937goto update_sectors;938}939940/* cannot merge and there is space in bad table */941if ((prev + 1) < bb->count &&942overlap_behind(bb, &bad, prev + 1))943bad.len = min_t(sector_t,944bad.len, BB_OFFSET(p[prev + 1]) - bad.start);945946len = insert_at(bb, prev + 1, &bad);947bb->count++;948added++;949hint = ++prev;950951update_sectors:952s += len;953sectors -= len;954955if (sectors > 0)956goto re_insert;957958/*959* Check whether the following already set range can be960* merged. (prev < 0) condition is not handled here,961* because it's already complicated enough.962*/963try_adjacent_combine(bb, prev);964965out:966if (added) {967set_changed(bb);968969if (!acknowledged)970bb->unacked_exist = 1;971else972badblocks_update_acked(bb);973}974975write_sequnlock_irqrestore(&bb->lock, flags);976977return sectors == 0;978}979980/*981* Clear the bad block range from bad block table which is front overlapped982* with the clearing range. The return value is how many sectors from an983* already set bad block range are cleared. If the whole bad block range is984* covered by the clearing range and fully cleared, 'delete' is set as 1 for985* the caller to reduce bb->count.986*/987static int front_clear(struct badblocks *bb, int prev,988struct badblocks_context *bad, int *deleted)989{990sector_t sectors = bad->len;991sector_t s = bad->start;992u64 *p = bb->page;993int cleared = 0;994995*deleted = 0;996if (s == BB_OFFSET(p[prev])) {997if (BB_LEN(p[prev]) > sectors) {998p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors,999BB_LEN(p[prev]) - sectors,1000BB_ACK(p[prev]));1001cleared = sectors;1002} else {1003/* BB_LEN(p[prev]) <= sectors */1004cleared = BB_LEN(p[prev]);1005if ((prev + 1) < bb->count)1006memmove(p + prev, p + prev + 1,1007(bb->count - prev - 1) * 8);1008*deleted = 1;1009}1010} else if (s > BB_OFFSET(p[prev])) {1011if (BB_END(p[prev]) <= (s + sectors)) {1012cleared = BB_END(p[prev]) - s;1013p[prev] = BB_MAKE(BB_OFFSET(p[prev]),1014s - BB_OFFSET(p[prev]),1015BB_ACK(p[prev]));1016} else {1017/* Splitting is handled in front_splitting_clear() */1018BUG();1019}1020}10211022return cleared;1023}10241025/*1026* Handle the condition that the clearing range hits middle of an already set1027* bad block range from bad block table. In this condition the existing bad1028* block range is split into two after the middle part is cleared.1029*/1030static int front_splitting_clear(struct badblocks *bb, int prev,1031struct badblocks_context *bad)1032{1033u64 *p = bb->page;1034u64 end = BB_END(p[prev]);1035int ack = BB_ACK(p[prev]);1036sector_t sectors = bad->len;1037sector_t s = bad->start;10381039p[prev] = BB_MAKE(BB_OFFSET(p[prev]),1040s - BB_OFFSET(p[prev]),1041ack);1042memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8);1043p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack);1044return sectors;1045}10461047/* Do the exact work to clear bad block range from the bad block table */1048static bool _badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors)1049{1050struct badblocks_context bad;1051int prev = -1, hint = -1;1052int len = 0, cleared = 0;1053u64 *p;10541055if (bb->shift < 0)1056/* badblocks are disabled */1057return false;10581059if (sectors == 0)1060/* Invalid sectors number */1061return false;10621063if (bb->shift) {1064sector_t target;10651066/* When clearing we round the start up and the end down.1067* This should not matter as the shift should align with1068* the block size and no rounding should ever be needed.1069* However it is better the think a block is bad when it1070* isn't than to think a block is not bad when it is.1071*/1072target = s + sectors;1073roundup(s, 1 << bb->shift);1074rounddown(target, 1 << bb->shift);1075sectors = target - s;1076}10771078write_seqlock_irq(&bb->lock);10791080bad.ack = true;1081p = bb->page;10821083re_clear:1084bad.start = s;1085bad.len = sectors;10861087if (badblocks_empty(bb)) {1088len = sectors;1089cleared++;1090goto update_sectors;1091}109210931094prev = prev_badblocks(bb, &bad, hint);10951096/* Start before all badblocks */1097if (prev < 0) {1098if (overlap_behind(bb, &bad, 0)) {1099len = BB_OFFSET(p[0]) - s;1100hint = 0;1101} else {1102len = sectors;1103}1104/*1105* Both situations are to clear non-bad range,1106* should be treated as successful1107*/1108cleared++;1109goto update_sectors;1110}11111112/* Start after all badblocks */1113if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {1114len = sectors;1115cleared++;1116goto update_sectors;1117}11181119/* Clear will split a bad record but the table is full */1120if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) &&1121(BB_END(p[prev]) > (bad.start + sectors))) {1122len = sectors;1123goto update_sectors;1124}11251126if (overlap_front(bb, prev, &bad)) {1127if ((BB_OFFSET(p[prev]) < bad.start) &&1128(BB_END(p[prev]) > (bad.start + bad.len))) {1129/* Splitting */1130if ((bb->count + 1) <= MAX_BADBLOCKS) {1131len = front_splitting_clear(bb, prev, &bad);1132bb->count += 1;1133cleared++;1134} else {1135/* No space to split, give up */1136len = sectors;1137}1138} else {1139int deleted = 0;11401141len = front_clear(bb, prev, &bad, &deleted);1142bb->count -= deleted;1143cleared++;1144hint = prev;1145}11461147goto update_sectors;1148}11491150/* Not front overlap, but behind overlap */1151if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {1152len = BB_OFFSET(p[prev + 1]) - bad.start;1153hint = prev + 1;1154/* Clear non-bad range should be treated as successful */1155cleared++;1156goto update_sectors;1157}11581159/* Not cover any badblocks range in the table */1160len = sectors;1161/* Clear non-bad range should be treated as successful */1162cleared++;11631164update_sectors:1165s += len;1166sectors -= len;11671168if (sectors > 0)1169goto re_clear;11701171if (cleared) {1172badblocks_update_acked(bb);1173set_changed(bb);1174}11751176write_sequnlock_irq(&bb->lock);11771178if (!cleared)1179return false;11801181return true;1182}11831184/* Do the exact work to check bad blocks range from the bad block table */1185static int _badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors,1186sector_t *first_bad, sector_t *bad_sectors)1187{1188int prev = -1, hint = -1, set = 0;1189struct badblocks_context bad;1190int unacked_badblocks = 0;1191int acked_badblocks = 0;1192u64 *p = bb->page;1193int len, rv;11941195re_check:1196bad.start = s;1197bad.len = sectors;11981199if (badblocks_empty(bb)) {1200len = sectors;1201goto update_sectors;1202}12031204prev = prev_badblocks(bb, &bad, hint);12051206/* start after all badblocks */1207if ((prev >= 0) &&1208((prev + 1) >= bb->count) && !overlap_front(bb, prev, &bad)) {1209len = sectors;1210goto update_sectors;1211}12121213/* Overlapped with front badblocks record */1214if ((prev >= 0) && overlap_front(bb, prev, &bad)) {1215if (BB_ACK(p[prev]))1216acked_badblocks++;1217else1218unacked_badblocks++;12191220if (BB_END(p[prev]) >= (s + sectors))1221len = sectors;1222else1223len = BB_END(p[prev]) - s;12241225if (set == 0) {1226*first_bad = BB_OFFSET(p[prev]);1227*bad_sectors = BB_LEN(p[prev]);1228set = 1;1229}1230goto update_sectors;1231}12321233/* Not front overlap, but behind overlap */1234if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {1235len = BB_OFFSET(p[prev + 1]) - bad.start;1236hint = prev + 1;1237goto update_sectors;1238}12391240/* not cover any badblocks range in the table */1241len = sectors;12421243update_sectors:1244/* This situation should never happen */1245WARN_ON(sectors < len);12461247s += len;1248sectors -= len;12491250if (sectors > 0)1251goto re_check;12521253if (unacked_badblocks > 0)1254rv = -1;1255else if (acked_badblocks > 0)1256rv = 1;1257else1258rv = 0;12591260return rv;1261}12621263/**1264* badblocks_check() - check a given range for bad sectors1265* @bb: the badblocks structure that holds all badblock information1266* @s: sector (start) at which to check for badblocks1267* @sectors: number of sectors to check for badblocks1268* @first_bad: pointer to store location of the first badblock1269* @bad_sectors: pointer to store number of badblocks after @first_bad1270*1271* We can record which blocks on each device are 'bad' and so just1272* fail those blocks, or that stripe, rather than the whole device.1273* Entries in the bad-block table are 64bits wide. This comprises:1274* Length of bad-range, in sectors: 0-511 for lengths 1-5121275* Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)1276* A 'shift' can be set so that larger blocks are tracked and1277* consequently larger devices can be covered.1278* 'Acknowledged' flag - 1 bit. - the most significant bit.1279*1280* Locking of the bad-block table uses a seqlock so badblocks_check1281* might need to retry if it is very unlucky.1282* We will sometimes want to check for bad blocks in a bi_end_io function,1283* so we use the write_seqlock_irq variant.1284*1285* When looking for a bad block we specify a range and want to1286* know if any block in the range is bad. So we binary-search1287* to the last range that starts at-or-before the given endpoint,1288* (or "before the sector after the target range")1289* then see if it ends after the given start.1290*1291* Return:1292* 0: there are no known bad blocks in the range1293* 1: there are known bad block which are all acknowledged1294* -1: there are bad blocks which have not yet been acknowledged in metadata.1295* plus the start/length of the first bad section we overlap.1296*/1297int badblocks_check(struct badblocks *bb, sector_t s, sector_t sectors,1298sector_t *first_bad, sector_t *bad_sectors)1299{1300unsigned int seq;1301int rv;13021303WARN_ON(bb->shift < 0 || sectors == 0);13041305if (bb->shift > 0) {1306/* round the start down, and the end up */1307sector_t target = s + sectors;13081309rounddown(s, 1 << bb->shift);1310roundup(target, 1 << bb->shift);1311sectors = target - s;1312}13131314retry:1315seq = read_seqbegin(&bb->lock);1316rv = _badblocks_check(bb, s, sectors, first_bad, bad_sectors);1317if (read_seqretry(&bb->lock, seq))1318goto retry;13191320return rv;1321}1322EXPORT_SYMBOL_GPL(badblocks_check);13231324/**1325* badblocks_set() - Add a range of bad blocks to the table.1326* @bb: the badblocks structure that holds all badblock information1327* @s: first sector to mark as bad1328* @sectors: number of sectors to mark as bad1329* @acknowledged: weather to mark the bad sectors as acknowledged1330*1331* This might extend the table, or might contract it if two adjacent ranges1332* can be merged. We binary-search to find the 'insertion' point, then1333* decide how best to handle it.1334*1335* Return:1336* true: success1337* false: failed to set badblocks (out of space). Parital setting will be1338* treated as failure.1339*/1340bool badblocks_set(struct badblocks *bb, sector_t s, sector_t sectors,1341int acknowledged)1342{1343return _badblocks_set(bb, s, sectors, acknowledged);1344}1345EXPORT_SYMBOL_GPL(badblocks_set);13461347/**1348* badblocks_clear() - Remove a range of bad blocks to the table.1349* @bb: the badblocks structure that holds all badblock information1350* @s: first sector to mark as bad1351* @sectors: number of sectors to mark as bad1352*1353* This may involve extending the table if we spilt a region,1354* but it must not fail. So if the table becomes full, we just1355* drop the remove request.1356*1357* Return:1358* true: success1359* false: failed to clear badblocks1360*/1361bool badblocks_clear(struct badblocks *bb, sector_t s, sector_t sectors)1362{1363return _badblocks_clear(bb, s, sectors);1364}1365EXPORT_SYMBOL_GPL(badblocks_clear);13661367/**1368* ack_all_badblocks() - Acknowledge all bad blocks in a list.1369* @bb: the badblocks structure that holds all badblock information1370*1371* This only succeeds if ->changed is clear. It is used by1372* in-kernel metadata updates1373*/1374void ack_all_badblocks(struct badblocks *bb)1375{1376if (bb->page == NULL || bb->changed)1377/* no point even trying */1378return;1379write_seqlock_irq(&bb->lock);13801381if (bb->changed == 0 && bb->unacked_exist) {1382u64 *p = bb->page;1383int i;13841385for (i = 0; i < bb->count ; i++) {1386if (!BB_ACK(p[i])) {1387sector_t start = BB_OFFSET(p[i]);1388int len = BB_LEN(p[i]);13891390p[i] = BB_MAKE(start, len, 1);1391}1392}13931394for (i = 0; i < bb->count ; i++)1395while (try_adjacent_combine(bb, i))1396;13971398bb->unacked_exist = 0;1399}1400write_sequnlock_irq(&bb->lock);1401}1402EXPORT_SYMBOL_GPL(ack_all_badblocks);14031404/**1405* badblocks_show() - sysfs access to bad-blocks list1406* @bb: the badblocks structure that holds all badblock information1407* @page: buffer received from sysfs1408* @unack: weather to show unacknowledged badblocks1409*1410* Return:1411* Length of returned data1412*/1413ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)1414{1415size_t len;1416int i;1417u64 *p = bb->page;1418unsigned seq;14191420if (bb->shift < 0)1421return 0;14221423retry:1424seq = read_seqbegin(&bb->lock);14251426len = 0;1427i = 0;14281429while (len < PAGE_SIZE && i < bb->count) {1430sector_t s = BB_OFFSET(p[i]);1431unsigned int length = BB_LEN(p[i]);1432int ack = BB_ACK(p[i]);14331434i++;14351436if (unack && ack)1437continue;14381439len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",1440(unsigned long long)s << bb->shift,1441length << bb->shift);1442}1443if (unack && len == 0)1444bb->unacked_exist = 0;14451446if (read_seqretry(&bb->lock, seq))1447goto retry;14481449return len;1450}1451EXPORT_SYMBOL_GPL(badblocks_show);14521453/**1454* badblocks_store() - sysfs access to bad-blocks list1455* @bb: the badblocks structure that holds all badblock information1456* @page: buffer received from sysfs1457* @len: length of data received from sysfs1458* @unack: weather to show unacknowledged badblocks1459*1460* Return:1461* Length of the buffer processed or -ve error.1462*/1463ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,1464int unack)1465{1466unsigned long long sector;1467int length;1468char newline;14691470switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) {1471case 3:1472if (newline != '\n')1473return -EINVAL;1474fallthrough;1475case 2:1476if (length <= 0)1477return -EINVAL;1478break;1479default:1480return -EINVAL;1481}14821483if (!badblocks_set(bb, sector, length, !unack))1484return -ENOSPC;14851486return len;1487}1488EXPORT_SYMBOL_GPL(badblocks_store);14891490static int __badblocks_init(struct device *dev, struct badblocks *bb,1491int enable)1492{1493bb->dev = dev;1494bb->count = 0;1495if (enable)1496bb->shift = 0;1497else1498bb->shift = -1;1499if (dev)1500bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);1501else1502bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);1503if (!bb->page) {1504bb->shift = -1;1505return -ENOMEM;1506}1507seqlock_init(&bb->lock);15081509return 0;1510}15111512/**1513* badblocks_init() - initialize the badblocks structure1514* @bb: the badblocks structure that holds all badblock information1515* @enable: weather to enable badblocks accounting1516*1517* Return:1518* 0: success1519* -ve errno: on error1520*/1521int badblocks_init(struct badblocks *bb, int enable)1522{1523return __badblocks_init(NULL, bb, enable);1524}1525EXPORT_SYMBOL_GPL(badblocks_init);15261527int devm_init_badblocks(struct device *dev, struct badblocks *bb)1528{1529if (!bb)1530return -EINVAL;1531return __badblocks_init(dev, bb, 1);1532}1533EXPORT_SYMBOL_GPL(devm_init_badblocks);15341535/**1536* badblocks_exit() - free the badblocks structure1537* @bb: the badblocks structure that holds all badblock information1538*/1539void badblocks_exit(struct badblocks *bb)1540{1541if (!bb)1542return;1543if (bb->dev)1544devm_kfree(bb->dev, bb->page);1545else1546kfree(bb->page);1547bb->page = NULL;1548}1549EXPORT_SYMBOL_GPL(badblocks_exit);155015511552