generic_ops-inl.h

// Copyright 2021 Google LLC
// Copyright 2023,2024 Arm Limited and/or
// its affiliates <open-source-office@arm.com>
// SPDX-License-Identifier: Apache-2.0
// SPDX-License-Identifier: BSD-3-Clause
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Target-independent types/functions defined after target-specific ops.
// The "include guards" in this file that check HWY_TARGET_TOGGLE serve to skip
// the generic implementation here if native ops are already defined.

#include "hwy/base.h"

// Define detail::Shuffle1230 etc, but only when viewing the current header;
// normally this is included via highway.h, which includes ops/*.h.
#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
#include "hwy/detect_targets.h"
#include "hwy/ops/emu128-inl.h"
#endif  // HWY_IDE

// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
template <class V>
using LaneType = decltype(GetLane(V()));

// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the
// return type of functions that do not take a vector argument, or as an
// argument type if the function only has a template argument for D, or for
// explicit type names instead of auto. This may be a built-in type.
template <class D>
using Vec = decltype(Zero(D()));

// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template
// argument for D, or for explicit type names instead of auto.
template <class D>
using Mask = decltype(MaskFromVec(Zero(D())));
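
// Example (illustrative sketch, not part of this header): using these aliases
// for explicit types instead of auto. `LoadAndSquare` is a hypothetical
// caller-side helper.
//   template <class D>
//   Vec<D> LoadAndSquare(D d, const TFromD<D>* HWY_RESTRICT p) {
//     const Vec<D> v = LoadU(d, p);
//     return Mul(v, v);
//   }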

// Returns the closest value to v within [lo, hi].
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
  return Min(Max(lo, v), hi);
}
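
// Example (illustrative, not part of this header): clamping float lanes to
// [0, 1] before further processing, where `d` is the caller's float tag.
//   const auto clamped = Clamp(v, Zero(d), Set(d, 1.0f));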

// CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
// and RVV has its own implementation of -Lanes.
#if (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV) || HWY_IDE

template <size_t kLanes, class D>
HWY_API VFromD<D> CombineShiftRightLanes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  static_assert(kBytes < 16, "Shift count is per-block");
  return CombineShiftRightBytes<kBytes>(d, hi, lo);
}

#endif

// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, SignMask<TFromD<D>>()));
}

// Returns quiet NaN.
template <class D>
HWY_API Vec<D> NaN(D d) {
  const RebindToSigned<D> di;
  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
  // mantissa MSB (to indicate quiet) would be sufficient.
  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}

// Returns positive infinity.
template <class D>
HWY_API Vec<D> Inf(D d) {
  const RebindToUnsigned<D> du;
  using T = TFromD<D>;
  using TU = TFromD<decltype(du)>;
  const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>());
  return BitCast(d, Set(du, max_x2 >> 1));
}
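
// Worked example (illustrative) for float lanes: MaxExponentTimes2<float>()
// is 0xFF000000, i.e. twice the all-ones exponent field 0x7F800000, so
// max_x2 >> 1 is 0x7F800000: the bit pattern of +infinity (exponent all ones,
// mantissa zero).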

// ------------------------------ ZeroExtendResizeBitCast

// The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128
// target is in emu128-inl.h, and the implementation of
// detail::ZeroExtendResizeBitCast for the HWY_SCALAR target is in scalar-inl.h
#if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR

namespace detail {

#if HWY_HAVE_SCALABLE
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from,
    VFromD<DFrom> v) {
  const Repartition<uint8_t, DTo> d_to_u8;
  const auto resized = ResizeBitCast(d_to_u8, v);
  // Zero the upper bytes which were not present/valid in d_from.
  const size_t num_bytes = Lanes(Repartition<uint8_t, decltype(d_from)>());
  return BitCast(d_to, IfThenElseZero(FirstN(d_to_u8, num_bytes), resized));
}
#else  // target that uses fixed-size vectors

// Truncating or same-size resizing cast: same as ResizeBitCast
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
          HWY_IF_LANES_LE(kToVectSize, kFromVectSize)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/,
    VFromD<DFrom> v) {
  return ResizeBitCast(d_to, v);
}

// Resizing cast to vector that has twice the number of lanes of the source
// vector
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
          HWY_IF_LANES(kToVectSize, kFromVectSize * 2)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom d_from,
    VFromD<DFrom> v) {
  const Twice<decltype(d_from)> dt_from;
  return BitCast(d_to, ZeroExtendVector(dt_from, v));
}

// Resizing cast to vector that has more than twice the number of lanes of the
// source vector
template <size_t kFromVectSize, size_t kToVectSize, class DTo, class DFrom,
          HWY_IF_LANES_GT(kToVectSize, kFromVectSize * 2)>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(
    hwy::SizeTag<kFromVectSize> /* from_size_tag */,
    hwy::SizeTag<kToVectSize> /* to_size_tag */, DTo d_to, DFrom /*d_from*/,
    VFromD<DFrom> v) {
  using TFrom = TFromD<DFrom>;
  constexpr size_t kNumOfFromLanes = kFromVectSize / sizeof(TFrom);
  const Repartition<TFrom, decltype(d_to)> d_resize_to;
  return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, kNumOfFromLanes),
                                      ResizeBitCast(d_resize_to, v)));
}
#endif  // HWY_HAVE_SCALABLE

}  // namespace detail

#endif  // HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR

template <class DTo, class DFrom>
HWY_API VFromD<DTo> ZeroExtendResizeBitCast(DTo d_to, DFrom d_from,
                                            VFromD<DFrom> v) {
  return detail::ZeroExtendResizeBitCast(hwy::SizeTag<d_from.MaxBytes()>(),
                                         hwy::SizeTag<d_to.MaxBytes()>(), d_to,
                                         d_from, v);
}
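
// Example (illustrative, not part of this header): widen the storage of a
// half-length u8 vector into a full u8 vector, zeroing the added upper lanes.
// `d_full` and `d_half` are caller-defined tags.
//   const ScalableTag<uint8_t> d_full;
//   const Half<decltype(d_full)> d_half;
//   const VFromD<decltype(d_full)> v =
//       ZeroExtendResizeBitCast(d_full, d_half, Set(d_half, 0xFF));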

// ------------------------------ SafeFillN

template <class D, typename T = TFromD<D>>
HWY_API void SafeFillN(const size_t num, const T value, D d,
                       T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
  (void)d;
  for (size_t i = 0; i < num; ++i) {
    to[i] = value;
  }
#else
  BlendedStore(Set(d, value), FirstN(d, num), d, to);
#endif
}

// ------------------------------ SafeCopyN

template <class D, typename T = TFromD<D>>
HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
                       T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
  (void)d;
  for (size_t i = 0; i < num; ++i) {
    to[i] = from[i];
  }
#else
  const Mask<D> mask = FirstN(d, num);
  BlendedStore(MaskedLoad(mask, d, from), mask, d, to);
#endif
}
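
// Example (illustrative, not part of this header): copying `count` floats with
// full vectors for the body of the loop and SafeCopyN for the remainder, so
// that no load or store touches memory past `count` elements.
//   const ScalableTag<float> d;
//   const size_t N = Lanes(d);
//   size_t i = 0;
//   for (; i + N <= count; i += N) {
//     StoreU(LoadU(d, from + i), d, to + i);
//   }
//   SafeCopyN(count - i, d, from + i, to + i);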

// ------------------------------ IsNegative
#if (defined(HWY_NATIVE_IS_NEGATIVE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_IS_NEGATIVE
#undef HWY_NATIVE_IS_NEGATIVE
#else
#define HWY_NATIVE_IS_NEGATIVE
#endif

template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API Mask<DFromV<V>> IsNegative(V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  return RebindMask(d, MaskFromVec(BroadcastSignBit(BitCast(di, v))));
}

#endif  // HWY_NATIVE_IS_NEGATIVE

// ------------------------------ MaskFalse
#if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MASK_FALSE
#undef HWY_NATIVE_MASK_FALSE
#else
#define HWY_NATIVE_MASK_FALSE
#endif

template <class D>
HWY_API Mask<D> MaskFalse(D d) {
  return MaskFromVec(Zero(d));
}

#endif  // HWY_NATIVE_MASK_FALSE

// ------------------------------ IfNegativeThenElseZero
#if (defined(HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#undef HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#else
#define HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO
#endif

template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API V IfNegativeThenElseZero(V v, V yes) {
  return IfThenElseZero(IsNegative(v), yes);
}

#endif  // HWY_NATIVE_IF_NEG_THEN_ELSE_ZERO

// ------------------------------ IfNegativeThenZeroElse
#if (defined(HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#undef HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#else
#define HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE
#endif

template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API V IfNegativeThenZeroElse(V v, V no) {
  return IfThenZeroElse(IsNegative(v), no);
}

#endif  // HWY_NATIVE_IF_NEG_THEN_ZERO_ELSE

// ------------------------------ ZeroIfNegative (IfNegativeThenZeroElse)

// ZeroIfNegative is generic for all vector lengths
template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
HWY_API V ZeroIfNegative(V v) {
  return IfNegativeThenZeroElse(v, v);
}

// ------------------------------ BitwiseIfThenElse
#if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return Or(And(mask, yes), AndNot(mask, no));
}

#endif  // HWY_NATIVE_BITWISE_IF_THEN_ELSE
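
// Example (illustrative, not part of this header): per-bit blend that takes
// the sign bit from `b` and all remaining bits from `a`, similar to CopySign
// for integer vectors; `d` is the caller's tag.
//   const auto merged = BitwiseIfThenElse(SignBit(d), b, a);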

// ------------------------------ PromoteMaskTo
#if (defined(HWY_NATIVE_PROMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PROMOTE_MASK_TO
#undef HWY_NATIVE_PROMOTE_MASK_TO
#else
#define HWY_NATIVE_PROMOTE_MASK_TO
#endif

template <class DTo, class DFrom>
HWY_API Mask<DTo> PromoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
  static_assert(
      sizeof(TFromD<DTo>) > sizeof(TFromD<DFrom>),
      "sizeof(TFromD<DTo>) must be greater than sizeof(TFromD<DFrom>)");
  static_assert(
      IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
      "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");

  const RebindToSigned<decltype(d_to)> di_to;
  const RebindToSigned<decltype(d_from)> di_from;

  return MaskFromVec(BitCast(
      d_to, PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
}

#endif  // HWY_NATIVE_PROMOTE_MASK_TO

// ------------------------------ DemoteMaskTo
#if (defined(HWY_NATIVE_DEMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_DEMOTE_MASK_TO
#undef HWY_NATIVE_DEMOTE_MASK_TO
#else
#define HWY_NATIVE_DEMOTE_MASK_TO
#endif

template <class DTo, class DFrom>
HWY_API Mask<DTo> DemoteMaskTo(DTo d_to, DFrom d_from, Mask<DFrom> m) {
  static_assert(sizeof(TFromD<DTo>) < sizeof(TFromD<DFrom>),
                "sizeof(TFromD<DTo>) must be less than sizeof(TFromD<DFrom>)");
  static_assert(
      IsSame<Mask<DFrom>, Mask<Rebind<TFromD<DFrom>, DTo>>>(),
      "Mask<DFrom> must be the same type as Mask<Rebind<TFromD<DFrom>, DTo>>");

  const RebindToSigned<decltype(d_to)> di_to;
  const RebindToSigned<decltype(d_from)> di_from;

  return MaskFromVec(
      BitCast(d_to, DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m)))));
}

#endif  // HWY_NATIVE_DEMOTE_MASK_TO

// ------------------------------ CombineMasks
#if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_COMBINE_MASKS
#undef HWY_NATIVE_COMBINE_MASKS
#else
#define HWY_NATIVE_COMBINE_MASKS
#endif

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D>
HWY_API Mask<D> CombineMasks(D d, Mask<Half<D>> hi, Mask<Half<D>> lo) {
  const Half<decltype(d)> dh;
  return MaskFromVec(Combine(d, VecFromMask(dh, hi), VecFromMask(dh, lo)));
}
#endif

#endif  // HWY_NATIVE_COMBINE_MASKS

// ------------------------------ LowerHalfOfMask
#if (defined(HWY_NATIVE_LOWER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
#undef HWY_NATIVE_LOWER_HALF_OF_MASK
#else
#define HWY_NATIVE_LOWER_HALF_OF_MASK
#endif

template <class D>
HWY_API Mask<D> LowerHalfOfMask(D d, Mask<Twice<D>> m) {
  const Twice<decltype(d)> dt;
  return MaskFromVec(LowerHalf(d, VecFromMask(dt, m)));
}

#endif  // HWY_NATIVE_LOWER_HALF_OF_MASK

// ------------------------------ UpperHalfOfMask
#if (defined(HWY_NATIVE_UPPER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
#undef HWY_NATIVE_UPPER_HALF_OF_MASK
#else
#define HWY_NATIVE_UPPER_HALF_OF_MASK
#endif

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D>
HWY_API Mask<D> UpperHalfOfMask(D d, Mask<Twice<D>> m) {
  const Twice<decltype(d)> dt;
  return MaskFromVec(UpperHalf(d, VecFromMask(dt, m)));
}
#endif

#endif  // HWY_NATIVE_UPPER_HALF_OF_MASK

// ------------------------------ OrderedDemote2MasksTo
#if (defined(HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#else
#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
#endif

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class DTo, class DFrom>
HWY_API Mask<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask<DFrom> a,
                                        Mask<DFrom> b) {
  static_assert(
      sizeof(TFromD<DTo>) == sizeof(TFromD<DFrom>) / 2,
      "sizeof(TFromD<DTo>) must be equal to sizeof(TFromD<DFrom>) / 2");
  static_assert(IsSame<Mask<DTo>, Mask<Repartition<TFromD<DTo>, DFrom>>>(),
                "Mask<DTo> must be the same type as "
                "Mask<Repartition<TFromD<DTo>, DFrom>>");

  const RebindToSigned<decltype(d_from)> di_from;
  const RebindToSigned<decltype(d_to)> di_to;

  const auto va = BitCast(di_from, VecFromMask(d_from, a));
  const auto vb = BitCast(di_from, VecFromMask(d_from, b));
  return MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb)));
}
#endif

#endif  // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO

// ------------------------------ RotateLeft
template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V RotateLeft(V v) {
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");

  constexpr int kRotateRightAmt =
      (kBits == 0) ? 0 : static_cast<int>(kSizeInBits) - kBits;
  return RotateRight<kRotateRightAmt>(v);
}
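
// Worked example (illustrative): for a u32 lane holding 0x12345678,
// RotateLeft<8> returns 0x34567812, which is the same as RotateRight<24> of
// that value; kRotateRightAmt above performs exactly this mapping.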

// ------------------------------ InterleaveWholeLower/InterleaveWholeUpper
#if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
#undef HWY_NATIVE_INTERLEAVE_WHOLE
#else
#define HWY_NATIVE_INTERLEAVE_WHOLE
#endif

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
  // InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(a, b) if
  // D().MaxBytes() <= 16 is true
  return InterleaveLower(d, a, b);
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
  // InterleaveWholeUpper(d, a, b) is equivalent to InterleaveUpper(a, b) if
  // D().MaxBytes() <= 16 is true
  return InterleaveUpper(d, a, b);
}

// InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on AVX2/AVX3
// is implemented in x86_256-inl.h.
// InterleaveWholeLower/InterleaveWholeUpper for 64-byte vectors on AVX3 is
// implemented in x86_512-inl.h.
// InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on WASM_EMU256
// is implemented in wasm_256-inl.h.
#endif  // HWY_TARGET != HWY_SCALAR

#endif  // HWY_NATIVE_INTERLEAVE_WHOLE

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
// The InterleaveWholeLower without the optional D parameter is generic for all
// vector lengths.
template <class V>
HWY_API V InterleaveWholeLower(V a, V b) {
  return InterleaveWholeLower(DFromV<V>(), a, b);
}
#endif  // HWY_TARGET != HWY_SCALAR

// ------------------------------ InterleaveEven

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
// InterleaveEven without the optional D parameter is generic for all vector
// lengths
template <class V>
HWY_API V InterleaveEven(V a, V b) {
  return InterleaveEven(DFromV<V>(), a, b);
}
#endif

// ------------------------------ AddSub

template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V AddSub(V a, V b) {
  // AddSub(a, b) for a one-lane vector is equivalent to Sub(a, b)
  return Sub(a, b);
}

// AddSub for F32x2, F32x4, and F64x2 vectors is implemented in x86_128-inl.h on
// SSSE3/SSE4/AVX2/AVX3
// AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on
// AVX2/AVX3
// AddSub for F16/F32/F64 vectors on SVE is implemented in arm_sve-inl.h
// AddSub for integer vectors on SVE2 is implemented in arm_sve-inl.h
template <class V, HWY_IF_ADDSUB_V(V)>
HWY_API V AddSub(V a, V b) {
  using D = DFromV<decltype(a)>;
  using T = TFromD<D>;
  using TNegate = If<!hwy::IsSigned<T>(), MakeSigned<T>, T>;

  const D d;
  const Rebind<TNegate, D> d_negate;

  // Negate the even lanes of b
  const auto negated_even_b = OddEven(b, BitCast(d, Neg(BitCast(d_negate, b))));

  return Add(a, negated_even_b);
}
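
// Worked example (illustrative): for four-lane vectors a = {a0, a1, a2, a3}
// and b = {b0, b1, b2, b3}, AddSub returns {a0 - b0, a1 + b1, a2 - b2,
// a3 + b3}: even-indexed lanes are subtracted and odd-indexed lanes are added.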

// ------------------------------ MaskedAddOr etc.
#if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_MASKED_ARITH
#undef HWY_NATIVE_MASKED_ARITH
#else
#define HWY_NATIVE_MASKED_ARITH
#endif

template <class V, class M>
HWY_API V MaskedMinOr(V no, M m, V a, V b) {
  return IfThenElse(m, Min(a, b), no);
}

template <class V, class M>
HWY_API V MaskedMaxOr(V no, M m, V a, V b) {
  return IfThenElse(m, Max(a, b), no);
}

template <class V, class M>
HWY_API V MaskedAddOr(V no, M m, V a, V b) {
  return IfThenElse(m, Add(a, b), no);
}

template <class V, class M>
HWY_API V MaskedSubOr(V no, M m, V a, V b) {
  return IfThenElse(m, Sub(a, b), no);
}

template <class V, class M>
HWY_API V MaskedMulOr(V no, M m, V a, V b) {
  return IfThenElse(m, Mul(a, b), no);
}

template <class V, class M>
HWY_API V MaskedDivOr(V no, M m, V a, V b) {
  return IfThenElse(m, Div(a, b), no);
}

template <class V, class M>
HWY_API V MaskedModOr(V no, M m, V a, V b) {
  return IfThenElse(m, Mod(a, b), no);
}

template <class V, class M>
HWY_API V MaskedSatAddOr(V no, M m, V a, V b) {
  return IfThenElse(m, SaturatedAdd(a, b), no);
}

template <class V, class M>
HWY_API V MaskedSatSubOr(V no, M m, V a, V b) {
  return IfThenElse(m, SaturatedSub(a, b), no);
}

#endif  // HWY_NATIVE_MASKED_ARITH
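
// Example (illustrative, not part of this header): add `delta` only to lanes
// of `sum` that are still below `limit`, leaving the other lanes unchanged.
//   sum = MaskedAddOr(sum, Lt(sum, limit), sum, delta);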

// ------------------------------ IfNegativeThenNegOrUndefIfZero

#if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#else
#define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG
#endif

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
  // MaskedSubOr is more efficient than IfNegativeThenElse on RVV/SVE
  const auto zero = Zero(DFromV<V>());
  return MaskedSubOr(v, Lt(mask, zero), zero, v);
#else
  return IfNegativeThenElse(mask, Neg(v), v);
#endif
}

#endif  // HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
  return CopySign(v, Xor(mask, v));
}

// ------------------------------ SaturatedNeg

#if (defined(HWY_NATIVE_SATURATED_NEG_8_16_32) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
#undef HWY_NATIVE_SATURATED_NEG_8_16_32
#else
#define HWY_NATIVE_SATURATED_NEG_8_16_32
#endif

template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
          HWY_IF_SIGNED_V(V)>
HWY_API V SaturatedNeg(V v) {
  const DFromV<decltype(v)> d;
  return SaturatedSub(Zero(d), v);
}

template <class V, HWY_IF_I32(TFromV<V>)>
HWY_API V SaturatedNeg(V v) {
  const DFromV<decltype(v)> d;

#if HWY_TARGET == HWY_RVV || HWY_TARGET_IS_PPC || HWY_TARGET_IS_SVE || \
    HWY_TARGET_IS_NEON
  // RVV/PPC/SVE/NEON have native I32 SaturatedSub instructions
  return SaturatedSub(Zero(d), v);
#else
  // ~v[i] - ((v[i] > LimitsMin<int32_t>()) ? -1 : 0) is equivalent to
  // (v[i] > LimitsMin<int32_t>) ? (-v[i]) : LimitsMax<int32_t>() since
  // -v[i] == ~v[i] + 1 == ~v[i] - (-1) and
  // ~LimitsMin<int32_t>() == LimitsMax<int32_t>().
  return Sub(Not(v), VecFromMask(d, Gt(v, Set(d, LimitsMin<int32_t>()))));
#endif
}

#endif  // HWY_NATIVE_SATURATED_NEG_8_16_32
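
// Worked example (illustrative) of the non-native I32 path above: for
// v[i] == 5, ~v[i] == -6 and the comparison yields all-ones (-1), so the
// result is -6 - (-1) == -5. For v[i] == LimitsMin<int32_t>(), the comparison
// yields 0 and the result is ~v[i] == LimitsMax<int32_t>(), i.e. the negation
// saturates instead of wrapping around.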

#if (defined(HWY_NATIVE_SATURATED_NEG_64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_NEG_64
#undef HWY_NATIVE_SATURATED_NEG_64
#else
#define HWY_NATIVE_SATURATED_NEG_64
#endif

template <class V, HWY_IF_I64(TFromV<V>)>
HWY_API V SaturatedNeg(V v) {
#if HWY_TARGET == HWY_RVV || HWY_TARGET_IS_SVE || HWY_TARGET_IS_NEON
  // RVV/SVE/NEON have native I64 SaturatedSub instructions
  const DFromV<decltype(v)> d;
  return SaturatedSub(Zero(d), v);
#else
  const auto neg_v = Neg(v);
  return Add(neg_v, BroadcastSignBit(And(v, neg_v)));
#endif
}

#endif  // HWY_NATIVE_SATURATED_NEG_64

// ------------------------------ SaturatedAbs
#if (defined(HWY_NATIVE_SATURATED_ABS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SATURATED_ABS
#undef HWY_NATIVE_SATURATED_ABS
#else
#define HWY_NATIVE_SATURATED_ABS
#endif

template <class V, HWY_IF_SIGNED_V(V)>
HWY_API V SaturatedAbs(V v) {
  return Max(v, SaturatedNeg(v));
}

#endif

// ------------------------------ Reductions

// Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is
// toggled, they (RVV/SVE/Armv8/Emu128) implement ReduceSum and SumOfLanes via
// Set. Otherwise, they (Armv7/PPC/scalar/WASM/x86) define anywhere from zero
// to most of the SumOfLanes overloads. For the latter group, we define the
// remaining overloads here, plus ReduceSum, which uses them together with
// GetLane.
#if (defined(HWY_NATIVE_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REDUCE_SCALAR
#undef HWY_NATIVE_REDUCE_SCALAR
#else
#define HWY_NATIVE_REDUCE_SCALAR
#endif

namespace detail {

// Allows reusing the same shuffle code for SumOfLanes/MinOfLanes/MaxOfLanes.
struct AddFunc {
  template <class V>
  V operator()(V a, V b) const {
    return Add(a, b);
  }
};

struct MinFunc {
  template <class V>
  V operator()(V a, V b) const {
    return Min(a, b);
  }
};

struct MaxFunc {
  template <class V>
  V operator()(V a, V b) const {
    return Max(a, b);
  }
};

// No-op for vectors of at most one block.
template <class D, class Func, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE VFromD<D> ReduceAcrossBlocks(D, Func, VFromD<D> v) {
  return v;
}

// Reduces a lane with its counterpart in other block(s). Shared by AVX2 and
// WASM_EMU256. AVX3 has its own overload.
template <class D, class Func, HWY_IF_V_SIZE_D(D, 32)>
HWY_INLINE VFromD<D> ReduceAcrossBlocks(D /*d*/, Func f, VFromD<D> v) {
  return f(v, SwapAdjacentBlocks(v));
}

// These return the reduction result broadcasted across all lanes. They assume
// the caller has already reduced across blocks.

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v10) {
  return f(v10, Reverse2(d, v10));
}

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v3210) {
  const VFromD<D> v0123 = Reverse4(d, v3210);
  const VFromD<D> v03_12_12_03 = f(v3210, v0123);
  const VFromD<D> v12_03_03_12 = Reverse2(d, v03_12_12_03);
  return f(v03_12_12_03, v12_03_03_12);
}

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v76543210) {
  // The upper half is reversed from the lower half; omit for brevity.
  const VFromD<D> v34_25_16_07 = f(v76543210, Reverse8(d, v76543210));
  const VFromD<D> v0347_1625_1625_0347 =
      f(v34_25_16_07, Reverse4(d, v34_25_16_07));
  return f(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347));
}

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_U8_D(D)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
  const RepartitionToWide<decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;
  const VW vw = BitCast(dw, v);
  // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
  const VW even = And(vw, Set(dw, 0xFF));
  const VW odd = ShiftRight<8>(vw);
  const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
#if HWY_IS_LITTLE_ENDIAN
  return DupEven(BitCast(d, reduced));
#else
  return DupOdd(BitCast(d, reduced));
#endif
}

template <class D, class Func, HWY_IF_LANES_PER_BLOCK_D(D, 16), HWY_IF_I8_D(D)>
HWY_INLINE VFromD<D> ReduceWithinBlocks(D d, Func f, VFromD<D> v) {
  const RepartitionToWide<decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;
  const VW vw = BitCast(dw, v);
  // Sign-extend
  // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN.
  const VW even = ShiftRight<8>(ShiftLeft<8>(vw));
  const VW odd = ShiftRight<8>(vw);
  const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd));
#if HWY_IS_LITTLE_ENDIAN
  return DupEven(BitCast(d, reduced));
#else
  return DupOdd(BitCast(d, reduced));
#endif
}

}  // namespace detail

template <class D, HWY_IF_SUM_OF_LANES_D(D)>
HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
  const detail::AddFunc f;
  v = detail::ReduceAcrossBlocks(d, f, v);
  return detail::ReduceWithinBlocks(d, f, v);
}
template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
  const detail::MinFunc f;
  v = detail::ReduceAcrossBlocks(d, f, v);
  return detail::ReduceWithinBlocks(d, f, v);
}
template <class D, HWY_IF_MINMAX_OF_LANES_D(D)>
HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
  const detail::MaxFunc f;
  v = detail::ReduceAcrossBlocks(d, f, v);
  return detail::ReduceWithinBlocks(d, f, v);
}

template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
  return GetLane(SumOfLanes(d, v));
}
template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
  return GetLane(MinOfLanes(d, v));
}
template <class D, HWY_IF_REDUCE_D(D)>
HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
  return GetLane(MaxOfLanes(d, v));
}

#endif  // HWY_NATIVE_REDUCE_SCALAR
  682. // Corner cases for both generic and native implementations:
  683. // N=1 (native covers N=2 e.g. for u64x2 and even u32x2 on Arm)
  684. template <class D, HWY_IF_LANES_D(D, 1)>
  685. HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
  686. return GetLane(v);
  687. }
  688. template <class D, HWY_IF_LANES_D(D, 1)>
  689. HWY_API TFromD<D> ReduceMin(D /*d*/, VFromD<D> v) {
  690. return GetLane(v);
  691. }
  692. template <class D, HWY_IF_LANES_D(D, 1)>
  693. HWY_API TFromD<D> ReduceMax(D /*d*/, VFromD<D> v) {
  694. return GetLane(v);
  695. }
  696. template <class D, HWY_IF_LANES_D(D, 1)>
  697. HWY_API VFromD<D> SumOfLanes(D /* tag */, VFromD<D> v) {
  698. return v;
  699. }
  700. template <class D, HWY_IF_LANES_D(D, 1)>
  701. HWY_API VFromD<D> MinOfLanes(D /* tag */, VFromD<D> v) {
  702. return v;
  703. }
  704. template <class D, HWY_IF_LANES_D(D, 1)>
  705. HWY_API VFromD<D> MaxOfLanes(D /* tag */, VFromD<D> v) {
  706. return v;
  707. }
  708. // N=4 for 8-bit is still less than the minimum native size.
  709. // ARMv7 NEON/PPC/RVV/SVE have target-specific implementations of the N=4 I8/U8
  710. // ReduceSum operations
  711. #if (defined(HWY_NATIVE_REDUCE_SUM_4_UI8) == defined(HWY_TARGET_TOGGLE))
  712. #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
  713. #undef HWY_NATIVE_REDUCE_SUM_4_UI8
  714. #else
  715. #define HWY_NATIVE_REDUCE_SUM_4_UI8
  716. #endif
  717. template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
  718. HWY_API TFromD<D> ReduceSum(D d, VFromD<D> v) {
  719. const Twice<RepartitionToWide<decltype(d)>> dw;
  720. return static_cast<TFromD<D>>(ReduceSum(dw, PromoteTo(dw, v)));
  721. }
  722. #endif // HWY_NATIVE_REDUCE_SUM_4_UI8
  723. // RVV/SVE have target-specific implementations of the N=4 I8/U8
  724. // ReduceMin/ReduceMax operations
  725. #if (defined(HWY_NATIVE_REDUCE_MINMAX_4_UI8) == defined(HWY_TARGET_TOGGLE))
  726. #ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
  727. #undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
  728. #else
  729. #define HWY_NATIVE_REDUCE_MINMAX_4_UI8
  730. #endif
  731. template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
  732. HWY_API TFromD<D> ReduceMin(D d, VFromD<D> v) {
  733. const Twice<RepartitionToWide<decltype(d)>> dw;
  734. return static_cast<TFromD<D>>(ReduceMin(dw, PromoteTo(dw, v)));
  735. }
  736. template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
  737. HWY_API TFromD<D> ReduceMax(D d, VFromD<D> v) {
  738. const Twice<RepartitionToWide<decltype(d)>> dw;
  739. return static_cast<TFromD<D>>(ReduceMax(dw, PromoteTo(dw, v)));
  740. }
  741. #endif // HWY_NATIVE_REDUCE_MINMAX_4_UI8
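
// Usage example (illustrative sketch, not part of the library): a strip-mined
// sum over an array, reduced with ReduceSum. `SumArray`, `data` and `count`
// are hypothetical names introduced only for this sketch.
//   template <typename T>
//   T SumArray(const T* HWY_RESTRICT data, size_t count) {
//     const ScalableTag<T> d;
//     auto sum = Zero(d);
//     size_t i = 0;
//     for (; i + Lanes(d) <= count; i += Lanes(d)) {
//       sum = Add(sum, LoadU(d, data + i));
//     }
//     T total = ReduceSum(d, sum);  // horizontal reduction of the accumulator
//     for (; i < count; ++i) total += data[i];  // scalar remainder
//     return total;
//   }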

// ------------------------------ IsEitherNaN

#if (defined(HWY_NATIVE_IS_EITHER_NAN) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_IS_EITHER_NAN
#undef HWY_NATIVE_IS_EITHER_NAN
#else
#define HWY_NATIVE_IS_EITHER_NAN
#endif

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API MFromD<DFromV<V>> IsEitherNaN(V a, V b) {
  return Or(IsNaN(a), IsNaN(b));
}

#endif  // HWY_NATIVE_IS_EITHER_NAN

// ------------------------------ IsInf, IsFinite

// AVX3 has target-specific implementations of these.
#if (defined(HWY_NATIVE_ISINF) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ISINF
#undef HWY_NATIVE_ISINF
#else
#define HWY_NATIVE_ISINF
#endif

template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsInf(const V v) {
  using T = TFromD<D>;
  const D d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(
      d,
      Eq(Add(vu, vu),
         Set(du, static_cast<MakeUnsigned<T>>(hwy::MaxExponentTimes2<T>()))));
}

// Returns whether normal/subnormal/zero.
template <class V, class D = DFromV<V>>
HWY_API MFromD<D> IsFinite(const V v) {
  using T = TFromD<D>;
  const D d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit. MSVC seems to generate incorrect code
  // for AVX2 if we instead add vu + vu.
#if HWY_COMPILER_MSVC
  const VFromD<decltype(du)> shl = ShiftLeft<1>(vu);
#else
  const VFromD<decltype(du)> shl = Add(vu, vu);
#endif
  // Then shift right so we can compare with the max exponent (cannot compare
  // with MaxExponentTimes2 directly because it is negative and non-negative
  // floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(shl));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}

#endif  // HWY_NATIVE_ISINF
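
// Usage example (illustrative sketch, assuming a float tag `d` and a vector
// `v`; the mask names are placeholders, not library symbols):
//   const auto inf_mask = IsInf(v);        // lane-wise: +inf or -inf
//   const auto finite_mask = IsFinite(v);  // normal, subnormal or zero
//   // NaN lanes are neither Inf nor finite:
//   const auto nan_mask = Not(Or(inf_mask, finite_mask));  // == IsNaN(v)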

// ------------------------------ LoadInterleaved2

#if HWY_IDE || \
    (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

template <class D, HWY_IF_LANES_GT_D(D, 1)>
HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1) {
  const VFromD<D> A = LoadU(d, unaligned);  // v1[1] v0[1] v1[0] v0[0]
  const VFromD<D> B = LoadU(d, unaligned + Lanes(d));
  v0 = ConcatEven(d, B, A);
  v1 = ConcatOdd(d, B, A);
}

template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved2(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
}
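
// Usage example (illustrative sketch): de-interleaving real/imaginary pairs
// from an array of 2*n floats into separate planes. `in`, `re`, `im` and `n`
// are hypothetical names; remainder handling is omitted.
//   const ScalableTag<float> d;
//   for (size_t i = 0; i + Lanes(d) <= n; i += Lanes(d)) {
//     VFromD<decltype(d)> re_v, im_v;
//     LoadInterleaved2(d, in + 2 * i, re_v, im_v);
//     StoreU(re_v, d, re + i);
//     StoreU(im_v, d, im + i);
//   }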

// ------------------------------ LoadInterleaved3 (CombineShiftRightBytes)

namespace detail {

#if HWY_IDE
template <class V>
HWY_INLINE V ShuffleTwo1230(V a, V /* b */) {
  return a;
}
template <class V>
HWY_INLINE V ShuffleTwo2301(V a, V /* b */) {
  return a;
}
template <class V>
HWY_INLINE V ShuffleTwo3012(V a, V /* b */) {
  return a;
}
#endif  // HWY_IDE

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void LoadTransposedBlocks3(D d,
                                      const TFromD<D>* HWY_RESTRICT unaligned,
                                      VFromD<D>& A, VFromD<D>& B,
                                      VFromD<D>& C) {
  constexpr size_t kN = MaxLanes(d);
  A = LoadU(d, unaligned + 0 * kN);
  B = LoadU(d, unaligned + 1 * kN);
  C = LoadU(d, unaligned + 2 * kN);
}

}  // namespace detail

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  using V = VFromD<D>;
  using VU = VFromD<decltype(du)>;
  // Compact notation so these fit on one line: 12 := v1[2].
  V A;  // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
  V B;  // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
  V C;  // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes.
  constexpr uint8_t Z = 0x80;
  const VU idx_v0A =
      Dup128VecFromValues(du, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU idx_v0B =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z);
  const VU idx_v0C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13);
  const VU idx_v1A =
      Dup128VecFromValues(du, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU idx_v1B =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z);
  const VU idx_v1C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14);
  const VU idx_v2A =
      Dup128VecFromValues(du, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU idx_v2B =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z);
  const VU idx_v2C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15);
  const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
  const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
  const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
  const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
  const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
  const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
  const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
  const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
  const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}

// 8-bit lanes x8
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  using V = VFromD<D>;
  using VU = VFromD<decltype(du)>;
  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes.
  constexpr uint8_t Z = 0x80;
  const VU idx_v0A =
      Dup128VecFromValues(du, 0, 3, 6, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v0B =
      Dup128VecFromValues(du, Z, Z, Z, 1, 4, 7, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v0C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v1A =
      Dup128VecFromValues(du, 1, 4, 7, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v1B =
      Dup128VecFromValues(du, Z, Z, Z, 2, 5, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v1C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v2A =
      Dup128VecFromValues(du, 2, 5, Z, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v2B =
      Dup128VecFromValues(du, Z, Z, 0, 3, 6, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0);
  const VU idx_v2C =
      Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0);
  const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A));
  const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B));
  const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C));
  const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A));
  const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B));
  const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C));
  const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A));
  const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B));
  const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C));
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}

// 16-bit lanes x8
template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint8_t, decltype(du)> du8;
  using V = VFromD<D>;
  using VU8 = VFromD<decltype(du8)>;
  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
  // but each element of the array contains a byte index for a byte of a lane.
  constexpr uint8_t Z = 0x80;
  const VU8 idx_v0A = Dup128VecFromValues(du8, 0x00, 0x01, 0x06, 0x07, 0x0C,
                                          0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v0B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x02, 0x03,
                                          0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z);
  const VU8 idx_v0C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          Z, 0x04, 0x05, 0x0A, 0x0B);
  const VU8 idx_v1A = Dup128VecFromValues(du8, 0x02, 0x03, 0x08, 0x09, 0x0E,
                                          0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v1B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x04, 0x05,
                                          0x0A, 0x0B, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v1C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D);
  const VU8 idx_v2A = Dup128VecFromValues(du8, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z,
                                          Z, Z, Z, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v2B = Dup128VecFromValues(du8, Z, Z, Z, Z, 0x00, 0x01, 0x06,
                                          0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z);
  const VU8 idx_v2C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z,
                                          0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F);
  const V v0L = TableLookupBytesOr0(A, BitCast(d, idx_v0A));
  const V v0M = TableLookupBytesOr0(B, BitCast(d, idx_v0B));
  const V v0U = TableLookupBytesOr0(C, BitCast(d, idx_v0C));
  const V v1L = TableLookupBytesOr0(A, BitCast(d, idx_v1A));
  const V v1M = TableLookupBytesOr0(B, BitCast(d, idx_v1B));
  const V v1U = TableLookupBytesOr0(C, BitCast(d, idx_v1C));
  const V v2L = TableLookupBytesOr0(A, BitCast(d, idx_v2A));
  const V v2M = TableLookupBytesOr0(B, BitCast(d, idx_v2B));
  const V v2U = TableLookupBytesOr0(C, BitCast(d, idx_v2C));
  v0 = Xor3(v0L, v0M, v0U);
  v1 = Xor3(v1L, v1M, v1U);
  v2 = Xor3(v2L, v2M, v2U);
}

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  using V = VFromD<D>;
  V A;  // v0[1] v2[0] v1[0] v0[0]
  V B;  // v1[2] v0[2] v2[1] v1[1]
  V C;  // v2[3] v1[3] v0[3] v2[2]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  const V vxx_02_03_xx = OddEven(C, B);
  v0 = detail::ShuffleTwo1230(A, vxx_02_03_xx);
  // Shuffle2301 takes the upper/lower halves of the output from one input, so
  // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use
  // OddEven because it may have higher throughput than Shuffle.
  const V vxx_xx_10_11 = OddEven(A, B);
  const V v12_13_xx_xx = OddEven(B, C);
  v1 = detail::ShuffleTwo2301(vxx_xx_10_11, v12_13_xx_xx);
  const V vxx_20_21_xx = OddEven(B, A);
  v2 = detail::ShuffleTwo3012(vxx_20_21_xx, C);
}

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  VFromD<D> A;  // v1[0] v0[0]
  VFromD<D> B;  // v0[1] v2[0]
  VFromD<D> C;  // v2[1] v1[1]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  v0 = OddEven(B, A);
  v1 = CombineShiftRightBytes<sizeof(TFromD<D>)>(d, C, A);
  v2 = OddEven(C, B);
}

template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
}
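
// Usage example (illustrative sketch): splitting packed RGB bytes into three
// planes. `rgb`, `r`, `g`, `b` and `num_pixels` are hypothetical names;
// remainder handling is omitted.
//   const ScalableTag<uint8_t> d;
//   for (size_t i = 0; i + Lanes(d) <= num_pixels; i += Lanes(d)) {
//     VFromD<decltype(d)> vr, vg, vb;
//     LoadInterleaved3(d, rgb + 3 * i, vr, vg, vb);
//     StoreU(vr, d, r + i);
//     StoreU(vg, d, g + i);
//     StoreU(vb, d, b + i);
//   }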

// ------------------------------ LoadInterleaved4

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void LoadTransposedBlocks4(D d,
                                      const TFromD<D>* HWY_RESTRICT unaligned,
                                      VFromD<D>& vA, VFromD<D>& vB,
                                      VFromD<D>& vC, VFromD<D>& vD) {
  constexpr size_t kN = MaxLanes(d);
  vA = LoadU(d, unaligned + 0 * kN);
  vB = LoadU(d, unaligned + 1 * kN);
  vC = LoadU(d, unaligned + 2 * kN);
  vD = LoadU(d, unaligned + 3 * kN);
}

}  // namespace detail

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 16)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  const Repartition<uint64_t, decltype(d)> d64;
  using V64 = VFromD<decltype(d64)>;
  using V = VFromD<D>;
  // 16 lanes per block; the lowest four blocks are at the bottom of vA..vD.
  // Here int[i] means the four interleaved values of the i-th 4-tuple and
  // int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
  V vA;  // int[13..10] int[3..0]
  V vB;  // int[17..14] int[7..4]
  V vC;  // int[1b..18] int[b..8]
  V vD;  // int[1f..1c] int[f..c]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
  // For brevity, the comments only list the lower block (upper = lower + 0x10)
  const V v5140 = InterleaveLower(d, vA, vB);        // int[5,1,4,0]
  const V vd9c8 = InterleaveLower(d, vC, vD);        // int[d,9,c,8]
  const V v7362 = InterleaveUpper(d, vA, vB);        // int[7,3,6,2]
  const V vfbea = InterleaveUpper(d, vC, vD);        // int[f,b,e,a]
  const V v6420 = InterleaveLower(d, v5140, v7362);  // int[6,4,2,0]
  const V veca8 = InterleaveLower(d, vd9c8, vfbea);  // int[e,c,a,8]
  const V v7531 = InterleaveUpper(d, v5140, v7362);  // int[7,5,3,1]
  const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea);  // int[f,d,b,9]
  const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531));  // v10[7..0]
  const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9));  // v10[f..8]
  const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531));  // v32[7..0]
  const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9));  // v32[f..8]
  v0 = BitCast(d, InterleaveLower(d64, v10L, v10U));
  v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U));
  v2 = BitCast(d, InterleaveLower(d64, v32L, v32U));
  v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U));
}

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 8)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  // In the last step, we interleave by half of the block size, which is usually
  // 8 bytes but half that for 8-bit x8 vectors.
  using TW = hwy::UnsignedFromSize<d.MaxBytes() == 8 ? 4 : 8>;
  const Repartition<TW, decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;
  // (Comments are for 256-bit vectors.)
  // 8 lanes per block; the lowest four blocks are at the bottom of vA..vD.
  VFromD<D> vA;  // v3210[9]v3210[8] v3210[1]v3210[0]
  VFromD<D> vB;  // v3210[b]v3210[a] v3210[3]v3210[2]
  VFromD<D> vC;  // v3210[d]v3210[c] v3210[5]v3210[4]
  VFromD<D> vD;  // v3210[f]v3210[e] v3210[7]v3210[6]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
  const VFromD<D> va820 = InterleaveLower(d, vA, vB);  // v3210[a,8] v3210[2,0]
  const VFromD<D> vec64 = InterleaveLower(d, vC, vD);  // v3210[e,c] v3210[6,4]
  const VFromD<D> vb931 = InterleaveUpper(d, vA, vB);  // v3210[b,9] v3210[3,1]
  const VFromD<D> vfd75 = InterleaveUpper(d, vC, vD);  // v3210[f,d] v3210[7,5]
  const VW v10_b830 =  // v10[b..8] v10[3..0]
      BitCast(dw, InterleaveLower(d, va820, vb931));
  const VW v10_fc74 =  // v10[f..c] v10[7..4]
      BitCast(dw, InterleaveLower(d, vec64, vfd75));
  const VW v32_b830 =  // v32[b..8] v32[3..0]
      BitCast(dw, InterleaveUpper(d, va820, vb931));
  const VW v32_fc74 =  // v32[f..c] v32[7..4]
      BitCast(dw, InterleaveUpper(d, vec64, vfd75));
  v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74));
  v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74));
  v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74));
  v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74));
}

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 4)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  using V = VFromD<D>;
  V vA;  // v3210[4] v3210[0]
  V vB;  // v3210[5] v3210[1]
  V vC;  // v3210[6] v3210[2]
  V vD;  // v3210[7] v3210[3]
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
  const V v10e = InterleaveLower(d, vA, vC);  // v1[6,4] v0[6,4] v1[2,0] v0[2,0]
  const V v10o = InterleaveLower(d, vB, vD);  // v1[7,5] v0[7,5] v1[3,1] v0[3,1]
  const V v32e = InterleaveUpper(d, vA, vC);  // v3[6,4] v2[6,4] v3[2,0] v2[2,0]
  const V v32o = InterleaveUpper(d, vB, vD);  // v3[7,5] v2[7,5] v3[3,1] v2[3,1]
  v0 = InterleaveLower(d, v10e, v10o);
  v1 = InterleaveUpper(d, v10e, v10o);
  v2 = InterleaveLower(d, v32e, v32o);
  v3 = InterleaveUpper(d, v32e, v32o);
}

template <class D, HWY_IF_LANES_PER_BLOCK_D(D, 2)>
HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  VFromD<D> vA, vB, vC, vD;
  detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD);
  v0 = InterleaveLower(d, vA, vC);
  v1 = InterleaveUpper(d, vA, vC);
  v2 = InterleaveLower(d, vB, vD);
  v3 = InterleaveUpper(d, vB, vD);
}

// Any T x1
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
                              VFromD<D>& v3) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
  v3 = LoadU(d, unaligned + 3);
}

// ------------------------------ StoreInterleaved2

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks2(VFromD<D> A, VFromD<D> B, D d,
                                       TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t kN = MaxLanes(d);
  StoreU(A, d, unaligned + 0 * kN);
  StoreU(B, d, unaligned + 1 * kN);
}

}  // namespace detail

// >= 128 bit vector
template <class D, HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const auto v10L = InterleaveLower(d, v0, v1);  // .. v1[0] v0[0]
  const auto v10U = InterleaveUpper(d, v0, v1);  // .. v1[kN/2] v0[kN/2]
  detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
}

// <= 64 bits
template <class V, class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API void StoreInterleaved2(V part0, V part1, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const Twice<decltype(d)> d2;
  const auto v0 = ZeroExtendVector(d2, part0);
  const auto v1 = ZeroExtendVector(d2, part1);
  const auto v10 = InterleaveLower(d2, v0, v1);
  StoreU(v10, d2, unaligned);
}
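
// Usage example (illustrative sketch): the inverse of LoadInterleaved2. Given
// hypothetical vectors `va`, `vb` of tag `d` and a destination `out` with room
// for 2 * Lanes(d) elements:
//   StoreInterleaved2(va, vb, d, out);  // out[0]=va[0], out[1]=vb[0], ...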

// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
// TableLookupBytes)

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks3(VFromD<D> A, VFromD<D> B, VFromD<D> C,
                                       D d, TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t kN = MaxLanes(d);
  StoreU(A, d, unaligned + 0 * kN);
  StoreU(B, d, unaligned + 1 * kN);
  StoreU(C, d, unaligned + 2 * kN);
}

}  // namespace detail

// >= 128-bit vector, 8-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  using VU = VFromD<decltype(du)>;
  const VU k5 = Set(du, TU{5});
  const VU k6 = Set(du, TU{6});
  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
  // to their place, with 0x80 so lanes to be filled from other vectors are 0
  // to enable blending by ORing together.
  const VFromD<decltype(du)> shuf_A0 =
      Dup128VecFromValues(du, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3,
                          0x80, 0x80, 4, 0x80, 0x80, 5);
  // Cannot reuse shuf_A0 because it contains 5.
  const VFromD<decltype(du)> shuf_A1 =
      Dup128VecFromValues(du, 0x80, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
                          3, 0x80, 0x80, 4, 0x80, 0x80);
  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  // cannot reuse shuf_A0 (has 5)
  const VU shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
  const VU vA0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
  const VU vA1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
  const VU vA2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
  const VFromD<D> A = BitCast(d, vA0 | vA1 | vA2);
  // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5]
  const VU shuf_B0 = shuf_A2 + k6;  // .A..9..8..7..6..
  const VU shuf_B1 = shuf_A0 + k5;  // A..9..8..7..6..5
  const VU shuf_B2 = shuf_A1 + k5;  // ..9..8..7..6..5.
  const VU vB0 = TableLookupBytesOr0(v0, shuf_B0);
  const VU vB1 = TableLookupBytesOr0(v1, shuf_B1);
  const VU vB2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<D> B = BitCast(d, vB0 | vB1 | vB2);
  // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10]
  const VU shuf_C0 = shuf_B2 + k6;  // ..F..E..D..C..B.
  const VU shuf_C1 = shuf_B0 + k5;  // .F..E..D..C..B..
  const VU shuf_C2 = shuf_B1 + k5;  // F..E..D..C..B..A
  const VU vC0 = TableLookupBytesOr0(v0, shuf_C0);
  const VU vC1 = TableLookupBytesOr0(v1, shuf_C1);
  const VU vC2 = TableLookupBytesOr0(v2, shuf_C2);
  const VFromD<D> C = BitCast(d, vC0 | vC1 | vC2);
  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}

// >= 128-bit vector, 16-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const Repartition<uint8_t, decltype(d)> du8;
  using VU8 = VFromD<decltype(du8)>;
  const VU8 k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
  const VU8 k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});
  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
  // filled from other vectors are 0 for blending. Note that these are byte
  // indices for 16-bit lanes.
  const VFromD<decltype(du8)> shuf_A1 =
      Dup128VecFromValues(du8, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3,
                          0x80, 0x80, 0x80, 0x80, 4, 5);
  const VFromD<decltype(du8)> shuf_A2 =
      Dup128VecFromValues(du8, 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80,
                          0x80, 2, 3, 0x80, 0x80, 0x80, 0x80);
  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU8 shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
  const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);
  const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);
  const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);
  const VFromD<D> A = BitCast(d, A0 | A1 | A2);
  // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
  const VU8 shuf_B0 = shuf_A1 + k3;  // 5..4..3.
  const VU8 shuf_B1 = shuf_A2 + k3;  // ..4..3..
  const VU8 shuf_B2 = shuf_A0 + k2;  // .4..3..2
  const VU8 vB0 = TableLookupBytesOr0(v0, shuf_B0);
  const VU8 vB1 = TableLookupBytesOr0(v1, shuf_B1);
  const VU8 vB2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<D> B = BitCast(d, vB0 | vB1 | vB2);
  // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
  const VU8 shuf_C0 = shuf_B1 + k3;  // ..7..6..
  const VU8 shuf_C1 = shuf_B2 + k3;  // .7..6..5
  const VU8 shuf_C2 = shuf_B0 + k2;  // 7..6..5.
  const VU8 vC0 = TableLookupBytesOr0(v0, shuf_C0);
  const VU8 vC1 = TableLookupBytesOr0(v1, shuf_C1);
  const VU8 vC2 = TableLookupBytesOr0(v2, shuf_C2);
  const VFromD<D> C = BitCast(d, vC0 | vC1 | vC2);
  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}

// >= 128-bit vector, 32-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d)> dw;
  const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1);
  const VFromD<D> v01_v20 = OddEven(v0, v2);
  // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0)
  const VFromD<D> A = BitCast(
      d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20)));
  const VFromD<D> v1_321 = ShiftRightLanes<1>(d, v1);
  const VFromD<D> v0_32 = ShiftRightLanes<2>(d, v0);
  const VFromD<D> v21_v11 = OddEven(v2, v1_321);
  const VFromD<D> v12_v02 = OddEven(v1_321, v0_32);
  // B: v1[2],v0[2], v2[1],v1[1]
  const VFromD<D> B = BitCast(
      d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02)));
  // Notation refers to the upper 2 lanes of the vector for InterleaveUpper.
  const VFromD<D> v23_v13 = OddEven(v2, v1_321);
  const VFromD<D> v03_v22 = OddEven(v0, v2);
  // C: v2[3],v1[3],v0[3], v2[2]
  const VFromD<D> C = BitCast(
      d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13)));
  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}

// >= 128-bit vector, 64-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const VFromD<D> A = InterleaveLower(d, v0, v1);
  const VFromD<D> B = OddEven(v0, v2);
  const VFromD<D> C = InterleaveUpper(d, v1, v2);
  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}

// 64-bit vector, 8-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 8)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and first result.
  constexpr size_t kFullN = 16 / sizeof(TFromD<D>);
  const Full128<uint8_t> du;
  using VU = VFromD<decltype(du)>;
  const Full128<TFromD<D>> d_full;
  const VU k5 = Set(du, uint8_t{5});
  const VU k6 = Set(du, uint8_t{6});
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
  // filled from other vectors are 0 for blending.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,  //
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU shuf_A0 = Load(du, tbl_v0);
  const VU shuf_A1 = Load(du, tbl_v1);  // cannot reuse shuf_A0 (5 in MSB)
  const VU shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
  const VU A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
  const VU A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
  const VU A2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
  const auto A = BitCast(d_full, A0 | A1 | A2);
  StoreU(A, d_full, unaligned + 0 * kFullN);
  // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
  const VU shuf_B0 = shuf_A2 + k6;  // ..7..6..
  const VU shuf_B1 = shuf_A0 + k5;  // .7..6..5
  const VU shuf_B2 = shuf_A1 + k5;  // 7..6..5.
  const VU vB0 = TableLookupBytesOr0(v0, shuf_B0);
  const VU vB1 = TableLookupBytesOr0(v1, shuf_B1);
  const VU vB2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<D> B{BitCast(d_full, vB0 | vB1 | vB2).raw};
  StoreU(B, d, unaligned + 1 * kFullN);
}

// 64-bit vector, 16-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 4)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D dh,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const Twice<D> d_full;
  const Full128<uint8_t> du8;
  using VU8 = VFromD<decltype(du8)>;
  const VU8 k2 = Set(du8, uint8_t{2 * sizeof(TFromD<D>)});
  const VU8 k3 = Set(du8, uint8_t{3 * sizeof(TFromD<D>)});
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes
  // to their place, with 0x80 so lanes to be filled from other vectors are 0
  // to enable blending by ORing together.
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
      2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
  // The interleaved vectors will be named A, B; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU8 shuf_A1 = Load(du8, tbl_v1);  // 2..1..0.
  // .2..1..0
  const VU8 shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
  const VU8 shuf_A2 = Load(du8, tbl_v2);  // ..1..0..
  const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);
  const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);
  const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);
  const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
  StoreU(A, d_full, unaligned);
  // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
  const VU8 shuf_B0 = shuf_A1 + k3;  // ..3.
  const VU8 shuf_B1 = shuf_A2 + k3;  // .3..
  const VU8 shuf_B2 = shuf_A0 + k2;  // 3..2
  const VU8 vB0 = TableLookupBytesOr0(v0, shuf_B0);
  const VU8 vB1 = TableLookupBytesOr0(v1, shuf_B1);
  const VU8 vB2 = TableLookupBytesOr0(v2, shuf_B2);
  const VFromD<decltype(d_full)> B = BitCast(d_full, vB0 | vB1 | vB2);
  StoreU(VFromD<D>{B.raw}, dh, unaligned + MaxLanes(d_full));
}

// 64-bit vector, 32-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_LANES_D(D, 2)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // (same code as 128-bit vector, 64-bit lanes)
  const VFromD<D> v10_v00 = InterleaveLower(d, v0, v1);
  const VFromD<D> v01_v20 = OddEven(v0, v2);
  const VFromD<D> v21_v11 = InterleaveUpper(d, v1, v2);
  constexpr size_t kN = MaxLanes(d);
  StoreU(v10_v00, d, unaligned + 0 * kN);
  StoreU(v01_v20, d, unaligned + 1 * kN);
  StoreU(v21_v11, d, unaligned + 2 * kN);
}

// 64-bit lanes are handled by the N=1 case below.

// <= 32-bit vector, 8-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 4),
          HWY_IF_LANES_GT_D(D, 1)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and result.
  const Full128<uint8_t> du;
  using VU = VFromD<decltype(du)>;
  const Full128<TFromD<D>> d_full;
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
  // so lanes to be filled from other vectors are 0 to enable blending by ORing
  // together.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80,
      0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
  // The interleaved vector will be named A; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU shuf_A0 = Load(du, tbl_v0);
  const VU shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
  const VU shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
  const VU A0 = TableLookupBytesOr0(v0, shuf_A0);  // ......3..2..1..0
  const VU A1 = TableLookupBytesOr0(v1, shuf_A1);  // .....3..2..1..0.
  const VU A2 = TableLookupBytesOr0(v2, shuf_A2);  // ....3..2..1..0..
  const VFromD<decltype(d_full)> A = BitCast(d_full, A0 | A1 | A2);
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
  StoreU(A, d_full, buf);
  CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
}

// 32-bit vector, 16-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_LANES_D(D, 2)>
HWY_API void StoreInterleaved3(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and result.
  const Full128<uint8_t> du8;
  using VU8 = VFromD<decltype(du8)>;
  const Full128<TFromD<D>> d_full;
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
  // so lanes to be filled from other vectors are 0 to enable blending by ORing
  // together.
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
  // The interleaved vector will be named A; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const VU8 shuf_A2 = Load(du8, tbl_v2);  // ..1..0..
  const VU8 shuf_A1 =
      CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);  // ...1..0.
  const VU8 shuf_A0 =
      CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);  // ....1..0
  const VU8 A0 = TableLookupBytesOr0(v0, shuf_A0);  // ..1..0
  const VU8 A1 = TableLookupBytesOr0(v1, shuf_A1);  // .1..0.
  const VU8 A2 = TableLookupBytesOr0(v2, shuf_A2);  // 1..0..
  const auto A = BitCast(d_full, A0 | A1 | A2);
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
  StoreU(A, d_full, buf);
  CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
}

// Single-element vector, any lane size: just store directly
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
  StoreU(v2, d, unaligned + 2);
}

// ------------------------------ StoreInterleaved4

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE void StoreTransposedBlocks4(VFromD<D> vA, VFromD<D> vB, VFromD<D> vC,
                                       VFromD<D> vD, D d,
                                       TFromD<D>* HWY_RESTRICT unaligned) {
  constexpr size_t kN = MaxLanes(d);
  StoreU(vA, d, unaligned + 0 * kN);
  StoreU(vB, d, unaligned + 1 * kN);
  StoreU(vC, d, unaligned + 2 * kN);
  StoreU(vD, d, unaligned + 3 * kN);
}

}  // namespace detail

// >= 128-bit vector, 8..32-bit lanes
template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                               VFromD<D> v3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d)> dw;
  const auto v10L = ZipLower(dw, v0, v1);  // .. v1[0] v0[0]
  const auto v32L = ZipLower(dw, v2, v3);
  const auto v10U = ZipUpper(dw, v0, v1);
  const auto v32U = ZipUpper(dw, v2, v3);
  // The interleaved vectors are vA, vB, vC, vD.
  const VFromD<D> vA = BitCast(d, InterleaveLower(dw, v10L, v32L));  // 3210
  const VFromD<D> vB = BitCast(d, InterleaveUpper(dw, v10L, v32L));
  const VFromD<D> vC = BitCast(d, InterleaveLower(dw, v10U, v32U));
  const VFromD<D> vD = BitCast(d, InterleaveUpper(dw, v10U, v32U));
  detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned);
}

// >= 128-bit vector, 64-bit lanes
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_GT_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                               VFromD<D> v3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // The interleaved vectors are vA, vB, vC, vD.
  const VFromD<D> vA = InterleaveLower(d, v0, v1);  // v1[0] v0[0]
  const VFromD<D> vB = InterleaveLower(d, v2, v3);
  const VFromD<D> vC = InterleaveUpper(d, v0, v1);
  const VFromD<D> vD = InterleaveUpper(d, v2, v3);
  detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned);
}

// 64-bit vector, 8..32-bit lanes
template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, VFromD<D> part3, D /* tag */,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<TFromD<D>> d_full;
  const RepartitionToWide<decltype(d_full)> dw;
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  const VFromD<decltype(d_full)> v3{part3.raw};
  const auto v10 = ZipLower(dw, v0, v1);  // v1[0] v0[0]
  const auto v32 = ZipLower(dw, v2, v3);
  const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32));
  const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32));
  StoreU(A, d_full, unaligned);
  StoreU(B, d_full, unaligned + MaxLanes(d_full));
}

// 64-bit vector, 64-bit lane
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, VFromD<D> part3, D /* tag */,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<TFromD<D>> d_full;
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  const VFromD<decltype(d_full)> v3{part3.raw};
  const auto A = InterleaveLower(d_full, v0, v1);  // v1[0] v0[0]
  const auto B = InterleaveLower(d_full, v2, v3);
  StoreU(A, d_full, unaligned);
  StoreU(B, d_full, unaligned + MaxLanes(d_full));
}

// <= 32-bit vectors
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API void StoreInterleaved4(VFromD<D> part0, VFromD<D> part1,
                               VFromD<D> part2, VFromD<D> part3, D d,
                               TFromD<D>* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<TFromD<D>> d_full;
  const RepartitionToWide<decltype(d_full)> dw;
  const VFromD<decltype(d_full)> v0{part0.raw};
  const VFromD<decltype(d_full)> v1{part1.raw};
  const VFromD<decltype(d_full)> v2{part2.raw};
  const VFromD<decltype(d_full)> v3{part3.raw};
  const auto v10 = ZipLower(dw, v0, v1);  // .. v1[0] v0[0]
  const auto v32 = ZipLower(dw, v2, v3);
  const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32));
  alignas(16) TFromD<D> buf[MaxLanes(d_full)];
  StoreU(v3210, d_full, buf);
  CopyBytes<d.MaxBytes() * 4>(buf, unaligned);
}

#endif  // HWY_NATIVE_LOAD_STORE_INTERLEAVED
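
// Usage example (illustrative sketch): packing four planes into RGBA bytes
// with StoreInterleaved4. `r`, `g`, `b`, `a`, `rgba` and `num_pixels` are
// hypothetical names; remainder handling is omitted.
//   const ScalableTag<uint8_t> d;
//   for (size_t i = 0; i + Lanes(d) <= num_pixels; i += Lanes(d)) {
//     StoreInterleaved4(LoadU(d, r + i), LoadU(d, g + i), LoadU(d, b + i),
//                       LoadU(d, a + i), d, rgba + 4 * i);
//   }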
  1577. // ------------------------------ LoadN
  1578. #if (defined(HWY_NATIVE_LOAD_N) == defined(HWY_TARGET_TOGGLE))
  1579. #ifdef HWY_NATIVE_LOAD_N
  1580. #undef HWY_NATIVE_LOAD_N
  1581. #else
  1582. #define HWY_NATIVE_LOAD_N
  1583. #endif
  1584. #if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
  1585. namespace detail {
  1586. template <class DTo, class DFrom>
  1587. HWY_INLINE VFromD<DTo> LoadNResizeBitCast(DTo d_to, DFrom d_from,
  1588. VFromD<DFrom> v) {
  1589. #if HWY_TARGET <= HWY_SSE2
  1590. // On SSE2/SSSE3/SSE4, the LoadU operation will zero out any lanes of v.raw
  1591. // past the first (lowest-index) Lanes(d_from) lanes of v.raw if
  1592. // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true
  1593. (void)d_from;
  1594. return ResizeBitCast(d_to, v);
  1595. #else
  1596. // On other targets such as PPC/NEON, the contents of any lanes past the first
  1597. // (lowest-index) Lanes(d_from) lanes of v.raw might be non-zero if
  1598. // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true.
  1599. return ZeroExtendResizeBitCast(d_to, d_from, v);
  1600. #endif
  1601. }
  1602. } // namespace detail
  1603. template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
  1604. HWY_IF_NOT_BF16_D(D)>
  1605. HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
  1606. size_t num_lanes) {
  1607. return (num_lanes > 0) ? LoadU(d, p) : Zero(d);
  1608. }
  1609. template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
  1610. HWY_IF_NOT_BF16_D(D)>
  1611. HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
  1612. size_t num_lanes) {
  1613. return (num_lanes > 0) ? LoadU(d, p) : no;
  1614. }
  1615. template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
  1616. HWY_IF_NOT_BF16_D(D)>
  1617. HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
  1618. size_t num_lanes) {
  1619. const FixedTag<TFromD<D>, 1> d1;
  1620. if (num_lanes >= 2) return LoadU(d, p);
  1621. if (num_lanes == 0) return Zero(d);
  1622. return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
  1623. }
  1624. template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
  1625. HWY_IF_NOT_BF16_D(D)>
  1626. HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
  1627. size_t num_lanes) {
  1628. const FixedTag<TFromD<D>, 1> d1;
  1629. if (num_lanes >= 2) return LoadU(d, p);
  1630. if (num_lanes == 0) return no;
  1631. return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no);
  1632. }
  1633. template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
  1634. HWY_IF_NOT_BF16_D(D)>
  1635. HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
  1636. size_t num_lanes) {
  1637. const FixedTag<TFromD<D>, 2> d2;
  1638. const Half<decltype(d2)> d1;
  1639. if (num_lanes >= 4) return LoadU(d, p);
  1640. if (num_lanes == 0) return Zero(d);
  1641. if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
  1642. // Two or three lanes.
  1643. const VFromD<D> v_lo = detail::LoadNResizeBitCast(d, d2, LoadU(d2, p));
  1644. return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
  1645. }
  1646. template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
  1647. HWY_IF_NOT_BF16_D(D)>
  1648. HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
  1649. size_t num_lanes) {
  1650. const FixedTag<TFromD<D>, 2> d2;
  1651. if (num_lanes >= 4) return LoadU(d, p);
  1652. if (num_lanes == 0) return no;
  1653. if (num_lanes == 1) return InsertLane(no, 0, p[0]);
  1654. // Two or three lanes.
  1655. const VFromD<D> v_lo =
  1656. ConcatUpperLower(d, no, ResizeBitCast(d, LoadU(d2, p)));
  1657. return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]);
  1658. }
  1659. template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
  1660. HWY_IF_NOT_BF16_D(D)>
  1661. HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
  1662. size_t num_lanes) {
  1663. const FixedTag<TFromD<D>, 4> d4;
  1664. const Half<decltype(d4)> d2;
  1665. const Half<decltype(d2)> d1;
  1666. if (num_lanes >= 8) return LoadU(d, p);
  1667. if (num_lanes == 0) return Zero(d);
  1668. if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
  1669. const size_t leading_len = num_lanes & 4;
  1670. VFromD<decltype(d4)> v_trailing = Zero(d4);
  1671. if ((num_lanes & 2) != 0) {
  1672. const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
  1673. if ((num_lanes & 1) != 0) {
  1674. v_trailing = Combine(
  1675. d4,
  1676. detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)),
  1677. v_trailing_lo2);
  1678. } else {
  1679. v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2);
  1680. }
  1681. } else if ((num_lanes & 1) != 0) {
  1682. v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len));
  1683. }
  1684. if (leading_len != 0) {
  1685. return Combine(d, v_trailing, LoadU(d4, p));
  1686. } else {
  1687. return detail::LoadNResizeBitCast(d, d4, v_trailing);
  1688. }
  1689. }
  1690. template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
  1691. HWY_IF_NOT_BF16_D(D)>
  1692. HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
  1693. size_t num_lanes) {
  1694. const FixedTag<TFromD<D>, 4> d4;
  1695. const Half<decltype(d4)> d2;
  1696. const Half<decltype(d2)> d1;
  1697. if (num_lanes >= 8) return LoadU(d, p);
  1698. if (num_lanes == 0) return no;
  1699. if (num_lanes == 1) return InsertLane(no, 0, p[0]);
  1700. const size_t leading_len = num_lanes & 4;
  1701. VFromD<decltype(d4)> v_trailing = ResizeBitCast(d4, no);
  1702. if ((num_lanes & 2) != 0) {
  1703. const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
  1704. if ((num_lanes & 1) != 0) {
  1705. v_trailing = Combine(
  1706. d4,
  1707. InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)),
  1708. ResizeBitCast(d2, no)),
  1709. v_trailing_lo2);
  1710. } else {
  1711. v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no),
  1712. ResizeBitCast(d4, v_trailing_lo2));
  1713. }
  1714. } else if ((num_lanes & 1) != 0) {
  1715. v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]);
  1716. }
  1717. if (leading_len != 0) {
  1718. return Combine(d, v_trailing, LoadU(d4, p));
  1719. } else {
  1720. return ConcatUpperLower(d, no, ResizeBitCast(d, v_trailing));
  1721. }
  1722. }
  1723. template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
  1724. HWY_IF_NOT_BF16_D(D)>
  1725. HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
  1726. size_t num_lanes) {
  1727. const FixedTag<TFromD<D>, 8> d8;
  1728. const Half<decltype(d8)> d4;
  1729. const Half<decltype(d4)> d2;
  1730. const Half<decltype(d2)> d1;
  1731. if (num_lanes >= 16) return LoadU(d, p);
  1732. if (num_lanes == 0) return Zero(d);
  1733. if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p));
  1734. const size_t leading_len = num_lanes & 12;
  1735. VFromD<decltype(d4)> v_trailing = Zero(d4);
  1736. if ((num_lanes & 2) != 0) {
  1737. const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
  1738. if ((num_lanes & 1) != 0) {
  1739. v_trailing = Combine(
  1740. d4,
  1741. detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)),
  1742. v_trailing_lo2);
  1743. } else {
  1744. v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2);
  1745. }
  1746. } else if ((num_lanes & 1) != 0) {
  1747. v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len));
  1748. }
  1749. if (leading_len != 0) {
  1750. if (leading_len >= 8) {
  1751. const VFromD<decltype(d8)> v_hi7 =
  1752. ((leading_len & 4) != 0)
  1753. ? Combine(d8, v_trailing, LoadU(d4, p + 8))
  1754. : detail::LoadNResizeBitCast(d8, d4, v_trailing);
  1755. return Combine(d, v_hi7, LoadU(d8, p));
  1756. } else {
  1757. return detail::LoadNResizeBitCast(d, d8,
  1758. Combine(d8, v_trailing, LoadU(d4, p)));
  1759. }
  1760. } else {
  1761. return detail::LoadNResizeBitCast(d, d4, v_trailing);
  1762. }
  1763. }
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
          HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  const FixedTag<TFromD<D>, 8> d8;
  const Half<decltype(d8)> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (num_lanes >= 16) return LoadU(d, p);
  if (num_lanes == 0) return no;
  if (num_lanes == 1) return InsertLane(no, 0, p[0]);

  const size_t leading_len = num_lanes & 12;
  VFromD<decltype(d4)> v_trailing = ResizeBitCast(d4, no);

  if ((num_lanes & 2) != 0) {
    const VFromD<decltype(d2)> v_trailing_lo2 = LoadU(d2, p + leading_len);
    if ((num_lanes & 1) != 0) {
      v_trailing = Combine(
          d4,
          InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)),
                          ResizeBitCast(d2, no)),
          v_trailing_lo2);
    } else {
      v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no),
                                    ResizeBitCast(d4, v_trailing_lo2));
    }
  } else if ((num_lanes & 1) != 0) {
    v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]);
  }

  if (leading_len != 0) {
    if (leading_len >= 8) {
      const VFromD<decltype(d8)> v_hi7 =
          ((leading_len & 4) != 0)
              ? Combine(d8, v_trailing, LoadU(d4, p + 8))
              : ConcatUpperLower(d8, ResizeBitCast(d8, no),
                                 ResizeBitCast(d8, v_trailing));
      return Combine(d, v_hi7, LoadU(d8, p));
    } else {
      return ConcatUpperLower(
          d, ResizeBitCast(d, no),
          ResizeBitCast(d, Combine(d8, v_trailing, LoadU(d4, p))));
    }
  } else {
    const Repartition<uint32_t, D> du32;
    // Lowest 4 bytes from v_trailing, next 4 from no.
    const VFromD<decltype(du32)> lo8 =
        InterleaveLower(ResizeBitCast(du32, v_trailing), BitCast(du32, no));
    return ConcatUpperLower(d, ResizeBitCast(d, no), ResizeBitCast(d, lo8));
  }
}
#if HWY_MAX_BYTES >= 32
template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  if (num_lanes >= Lanes(d)) return LoadU(d, p);

  const Half<decltype(d)> dh;
  const size_t half_N = Lanes(dh);
  if (num_lanes <= half_N) {
    return ZeroExtendVector(d, LoadN(dh, p, num_lanes));
  } else {
    const VFromD<decltype(dh)> v_lo = LoadU(dh, p);
    const VFromD<decltype(dh)> v_hi = LoadN(dh, p + half_N, num_lanes - half_N);
    return Combine(d, v_hi, v_lo);
  }
}

template <class D, HWY_IF_V_SIZE_GT_D(D, 16), HWY_IF_NOT_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  if (num_lanes >= Lanes(d)) return LoadU(d, p);

  const Half<decltype(d)> dh;
  const size_t half_N = Lanes(dh);
  const VFromD<decltype(dh)> no_h = LowerHalf(no);
  if (num_lanes <= half_N) {
    return ConcatUpperLower(d, no,
                            ResizeBitCast(d, LoadNOr(no_h, dh, p, num_lanes)));
  } else {
    const VFromD<decltype(dh)> v_lo = LoadU(dh, p);
    const VFromD<decltype(dh)> v_hi =
        LoadNOr(no_h, dh, p + half_N, num_lanes - half_N);
    return Combine(d, v_hi, v_lo);
  }
}
#endif  // HWY_MAX_BYTES >= 32

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
  const RebindToUnsigned<D> du;
  return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
}

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
  const RebindToUnsigned<D> du;
  return BitCast(
      d, LoadNOr(BitCast(du, no), du, detail::U16LanePointer(p), num_lanes));
}

#else  // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE

// For SVE and non-sanitizer AVX-512; RVV has its own specialization.
template <class D>
HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
                        size_t num_lanes) {
#if HWY_MEM_OPS_MIGHT_FAULT
  if (num_lanes <= 0) return Zero(d);
#endif
  return MaskedLoad(FirstN(d, num_lanes), d, p);
}

template <class D>
HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
                          size_t num_lanes) {
#if HWY_MEM_OPS_MIGHT_FAULT
  if (num_lanes <= 0) return no;
#endif
  return MaskedLoadOr(no, FirstN(d, num_lanes), d, p);
}

#endif  // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
#endif  // HWY_NATIVE_LOAD_N
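// Example usage (illustrative sketch, not part of this header's API): LoadN
// loads the first HWY_MIN(num_lanes, Lanes(d)) lanes and zeroes the rest, so a
// loop tail whose length is not a multiple of Lanes(d) can be handled as:
//   const ScalableTag<float> d;
//   for (size_t i = 0; i < count; i += Lanes(d)) {
//     const auto v = LoadN(d, ptr + i, count - i);  // `ptr`, `count`: caller's
//     ...                                           // hypothetical data
//   }
// LoadNOr behaves identically but fills the unloaded lanes from `no`.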
// ------------------------------ StoreN
#if (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_STORE_N
#undef HWY_NATIVE_STORE_N
#else
#define HWY_NATIVE_STORE_N
#endif

#if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
namespace detail {

template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) {
  constexpr size_t kMinShrVectBytes = HWY_TARGET_IS_NEON ? 8 : 16;
  const FixedTag<uint8_t, kMinShrVectBytes> d_shift;
  return ResizeBitCast(
      dh, ShiftRightBytes<dh.MaxBytes()>(d_shift, ResizeBitCast(d_shift, v)));
}

template <class DH, HWY_IF_V_SIZE_GT_D(DH, 4)>
HWY_INLINE VFromD<DH> StoreNGetUpperHalf(DH dh, VFromD<Twice<DH>> v) {
  return UpperHalf(dh, v);
}

}  // namespace detail

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 1),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  if (max_lanes_to_store > 0) {
    StoreU(v, d, p);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  if (max_lanes_to_store > 1) {
    StoreU(v, d, p);
  } else if (max_lanes_to_store == 1) {
    const FixedTag<TFromD<D>, 1> d1;
    StoreU(LowerHalf(d1, v), d1, p);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const FixedTag<TFromD<D>, 2> d2;
  const Half<decltype(d2)> d1;

  if (max_lanes_to_store > 1) {
    if (max_lanes_to_store >= 4) {
      StoreU(v, d, p);
    } else {
      StoreU(ResizeBitCast(d2, v), d2, p);
      if (max_lanes_to_store == 3) {
        StoreU(ResizeBitCast(d1, detail::StoreNGetUpperHalf(d2, v)), d1, p + 2);
      }
    }
  } else if (max_lanes_to_store == 1) {
    StoreU(ResizeBitCast(d1, v), d1, p);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const FixedTag<TFromD<D>, 4> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (max_lanes_to_store <= 1) {
    if (max_lanes_to_store == 1) {
      StoreU(ResizeBitCast(d1, v), d1, p);
    }
  } else if (max_lanes_to_store >= 8) {
    StoreU(v, d, p);
  } else if (max_lanes_to_store >= 4) {
    StoreU(LowerHalf(d4, v), d4, p);
    StoreN(detail::StoreNGetUpperHalf(d4, v), d4, p + 4,
           max_lanes_to_store - 4);
  } else {
    StoreN(LowerHalf(d4, v), d4, p, max_lanes_to_store);
  }
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 16),
          typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const FixedTag<TFromD<D>, 8> d8;
  const Half<decltype(d8)> d4;
  const Half<decltype(d4)> d2;
  const Half<decltype(d2)> d1;

  if (max_lanes_to_store <= 1) {
    if (max_lanes_to_store == 1) {
      StoreU(ResizeBitCast(d1, v), d1, p);
    }
  } else if (max_lanes_to_store >= 16) {
    StoreU(v, d, p);
  } else if (max_lanes_to_store >= 8) {
    StoreU(LowerHalf(d8, v), d8, p);
    StoreN(detail::StoreNGetUpperHalf(d8, v), d8, p + 8,
           max_lanes_to_store - 8);
  } else {
    StoreN(LowerHalf(d8, v), d8, p, max_lanes_to_store);
  }
}

#if HWY_MAX_BYTES >= 32
template <class D, HWY_IF_V_SIZE_GT_D(D, 16), typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const size_t N = Lanes(d);
  if (max_lanes_to_store >= N) {
    StoreU(v, d, p);
    return;
  }

  const Half<decltype(d)> dh;
  const size_t half_N = Lanes(dh);
  if (max_lanes_to_store <= half_N) {
    StoreN(LowerHalf(dh, v), dh, p, max_lanes_to_store);
  } else {
    StoreU(LowerHalf(dh, v), dh, p);
    StoreN(UpperHalf(dh, v), dh, p + half_N, max_lanes_to_store - half_N);
  }
}
#endif  // HWY_MAX_BYTES >= 32

#else  // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE

template <class D, typename T = TFromD<D>>
HWY_API void StoreN(VFromD<D> v, D d, T* HWY_RESTRICT p,
                    size_t max_lanes_to_store) {
  const size_t N = Lanes(d);
  const size_t clamped_max_lanes_to_store = HWY_MIN(max_lanes_to_store, N);
#if HWY_MEM_OPS_MIGHT_FAULT
  if (clamped_max_lanes_to_store == 0) return;
#endif
  BlendedStore(v, FirstN(d, clamped_max_lanes_to_store), d, p);
  detail::MaybeUnpoison(p, clamped_max_lanes_to_store);
}

#endif  // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE
#endif  // (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE))
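// Example usage (illustrative sketch): StoreN writes only the first
// HWY_MIN(max_lanes_to_store, Lanes(d)) lanes, so a loop tail can be stored
// without touching bytes past the end of the output buffer:
//   const ScalableTag<int32_t> d;
//   StoreN(v, d, out + i, count - i);  // `out`, `count`: caller's data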
// ------------------------------ Scatter
#if (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SCATTER
#undef HWY_NATIVE_SCATTER
#else
#define HWY_NATIVE_SCATTER
#endif

template <class D, typename T = TFromD<D>>
HWY_API void ScatterOffset(VFromD<D> v, D d, T* HWY_RESTRICT base,
                           VFromD<RebindToSigned<D>> offset) {
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  HWY_ALIGN TI offset_lanes[MaxLanes(d)];
  Store(offset, di, offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <class D, typename T = TFromD<D>>
HWY_API void ScatterIndex(VFromD<D> v, D d, T* HWY_RESTRICT base,
                          VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  HWY_ALIGN TI index_lanes[MaxLanes(d)];
  Store(index, di, index_lanes);

  for (size_t i = 0; i < MaxLanes(d); ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}

template <class D, typename T = TFromD<D>>
HWY_API void MaskedScatterIndex(VFromD<D> v, MFromD<D> m, D d,
                                T* HWY_RESTRICT base,
                                VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  HWY_ALIGN TI index_lanes[MaxLanes(d)];
  Store(index, di, index_lanes);

  HWY_ALIGN TI mask_lanes[MaxLanes(di)];
  Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);

  for (size_t i = 0; i < MaxLanes(d); ++i) {
    if (mask_lanes[i]) base[index_lanes[i]] = lanes[i];
  }
}

template <class D, typename T = TFromD<D>>
HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
                           VFromD<RebindToSigned<D>> index,
                           const size_t max_lanes_to_store) {
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  for (size_t i = 0; i < MaxLanes(d); ++i) {
    if (i < max_lanes_to_store) base[ExtractLane(index, i)] = ExtractLane(v, i);
  }
}

#else
template <class D, typename T = TFromD<D>>
HWY_API void ScatterIndexN(VFromD<D> v, D d, T* HWY_RESTRICT base,
                           VFromD<RebindToSigned<D>> index,
                           const size_t max_lanes_to_store) {
  MaskedScatterIndex(v, FirstN(d, max_lanes_to_store), d, base, index);
}
#endif  // (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE))
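// Example (illustrative): ScatterIndexN(v, d, base, index, n) stores only the
// first HWY_MIN(n, Lanes(d)) lanes, i.e. base[index[i]] = v[i] for i < n; the
// remaining lanes and their indices are ignored.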
// ------------------------------ Gather
#if (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_GATHER
#undef HWY_NATIVE_GATHER
#else
#define HWY_NATIVE_GATHER
#endif

template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherOffset(D d, const T* HWY_RESTRICT base,
                               VFromD<RebindToSigned<D>> offset) {
  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN TI offset_lanes[MaxLanes(d)];
  Store(offset, di, offset_lanes);

  HWY_ALIGN T lanes[MaxLanes(d)];
  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    HWY_DASSERT(offset_lanes[i] >= 0);
    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
  }
  return Load(d, lanes);
}

template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndex(D d, const T* HWY_RESTRICT base,
                              VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN TI index_lanes[MaxLanes(d)];
  Store(index, di, index_lanes);

  HWY_ALIGN T lanes[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    HWY_DASSERT(index_lanes[i] >= 0);
    lanes[i] = base[index_lanes[i]];
  }
  return Load(d, lanes);
}

template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedGatherIndex(MFromD<D> m, D d,
                                    const T* HWY_RESTRICT base,
                                    VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN TI index_lanes[MaxLanes(di)];
  Store(index, di, index_lanes);

  HWY_ALIGN TI mask_lanes[MaxLanes(di)];
  Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);

  HWY_ALIGN T lanes[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    HWY_DASSERT(index_lanes[i] >= 0);
    lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : T{0};
  }
  return Load(d, lanes);
}

template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> MaskedGatherIndexOr(VFromD<D> no, MFromD<D> m, D d,
                                      const T* HWY_RESTRICT base,
                                      VFromD<RebindToSigned<D>> index) {
  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  HWY_ALIGN TI index_lanes[MaxLanes(di)];
  Store(index, di, index_lanes);

  HWY_ALIGN TI mask_lanes[MaxLanes(di)];
  Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes);

  HWY_ALIGN T no_lanes[MaxLanes(d)];
  Store(no, d, no_lanes);

  HWY_ALIGN T lanes[MaxLanes(d)];
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    HWY_DASSERT(index_lanes[i] >= 0);
    lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : no_lanes[i];
  }
  return Load(d, lanes);
}

template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
                               VFromD<RebindToSigned<D>> index,
                               const size_t max_lanes_to_load) {
  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  VFromD<D> v = Zero(d);
  for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) {
    v = InsertLane(v, i, base[ExtractLane(index, i)]);
  }
  return v;
}

template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
                                 VFromD<RebindToSigned<D>> index,
                                 const size_t max_lanes_to_load) {
  const RebindToSigned<D> di;
  using TI = TFromD<decltype(di)>;
  static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match");

  VFromD<D> v = no;
  for (size_t i = 0; i < HWY_MIN(MaxLanes(d), max_lanes_to_load); ++i) {
    v = InsertLane(v, i, base[ExtractLane(index, i)]);
  }
  return v;
}

#else
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndexN(D d, const T* HWY_RESTRICT base,
                               VFromD<RebindToSigned<D>> index,
                               const size_t max_lanes_to_load) {
  return MaskedGatherIndex(FirstN(d, max_lanes_to_load), d, base, index);
}

template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> GatherIndexNOr(VFromD<D> no, D d, const T* HWY_RESTRICT base,
                                 VFromD<RebindToSigned<D>> index,
                                 const size_t max_lanes_to_load) {
  return MaskedGatherIndexOr(no, FirstN(d, max_lanes_to_load), d, base, index);
}
#endif  // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE))
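// Example (illustrative): GatherIndexN(d, base, index, n) returns a vector
// whose first HWY_MIN(n, Lanes(d)) lanes are base[index[i]] and whose
// remaining lanes are zero; GatherIndexNOr fills them from `no` instead.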
// ------------------------------ Integer AbsDiff and SumsOf8AbsDiff
#if (defined(HWY_NATIVE_INTEGER_ABS_DIFF) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
#undef HWY_NATIVE_INTEGER_ABS_DIFF
#else
#define HWY_NATIVE_INTEGER_ABS_DIFF
#endif

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V AbsDiff(V a, V b) {
  return Sub(Max(a, b), Min(a, b));
}
#endif  // HWY_NATIVE_INTEGER_ABS_DIFF
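// Worked example (illustrative): for uint8_t lanes a = 3 and b = 250,
// Max(a, b) = 250 and Min(a, b) = 3, so AbsDiff returns 247 without any
// intermediate overflow; it likewise equals |a - b| for signed lanes whenever
// the true difference is representable in the lane type.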
#if (defined(HWY_NATIVE_SUMS_OF_8_ABS_DIFF) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_8_ABS_DIFF
#endif

template <class V, HWY_IF_UI8_D(DFromV<V>),
          HWY_IF_V_SIZE_GT_D(DFromV<V>, (HWY_TARGET == HWY_SCALAR ? 0 : 4))>
HWY_API Vec<RepartitionToWideX3<DFromV<V>>> SumsOf8AbsDiff(V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RepartitionToWideX3<decltype(d)> dw;
  return BitCast(dw, SumsOf8(BitCast(du, AbsDiff(a, b))));
}
#endif  // HWY_NATIVE_SUMS_OF_8_ABS_DIFF
// ------------------------------ SaturatedAdd/SaturatedSub for UI32/UI64
#if (defined(HWY_NATIVE_I32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif

template <class V, HWY_IF_I32_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
  const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int32_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}

template <class V, HWY_IF_I32_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int32_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}
#endif  // HWY_NATIVE_I32_SATURATED_ADDSUB
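// The overflow detection above relies on sign bits: signed addition can only
// overflow when a and b have the same sign and the sum's sign differs, which
// is exactly when AndNot(Xor(a, b), Xor(a, sum)) is negative. Worked example
// (illustrative): a = 0x7FFFFFFF, b = 1 wraps to sum = 0x80000000, the mask is
// negative, and the result is BroadcastSignBit(a) ^ 0x7FFFFFFF = 0x7FFFFFFF,
// i.e. saturation toward the sign of a.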
#if (defined(HWY_NATIVE_I64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif

template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto sum = Add(a, b);
  const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum));
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, sum);
}

template <class V, HWY_IF_I64_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  const DFromV<decltype(a)> d;
  const auto diff = Sub(a, b);
  const auto overflow_mask = And(Xor(a, b), Xor(a, diff));
  const auto overflow_result =
      Xor(BroadcastSignBit(a), Set(d, LimitsMax<int64_t>()));
  return IfNegativeThenElse(overflow_mask, overflow_result, diff);
}
#endif  // HWY_NATIVE_I64_SATURATED_ADDSUB
#if (defined(HWY_NATIVE_U32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif

template <class V, HWY_IF_U32_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  return Add(a, Min(b, Not(a)));
}

template <class V, HWY_IF_U32_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  return Sub(a, Min(a, b));
}
#endif  // HWY_NATIVE_U32_SATURATED_ADDSUB
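// Here Not(a) is the headroom 0xFFFFFFFF - a, so Min(b, Not(a)) clamps the
// addend such that the sum never wraps; e.g. (illustrative) a = 0xFFFFFFF0,
// b = 0x40 gives Not(a) = 0x0F and a result of 0xFFFFFFFF. Likewise, Min(a, b)
// in SaturatedSub ensures the difference never drops below zero.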
#if (defined(HWY_NATIVE_U64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
#undef HWY_NATIVE_U64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U64_SATURATED_ADDSUB
#endif

template <class V, HWY_IF_U64_D(DFromV<V>)>
HWY_API V SaturatedAdd(V a, V b) {
  return Add(a, Min(b, Not(a)));
}

template <class V, HWY_IF_U64_D(DFromV<V>)>
HWY_API V SaturatedSub(V a, V b) {
  return Sub(a, Min(a, b));
}
#endif  // HWY_NATIVE_U64_SATURATED_ADDSUB
// ------------------------------ Unsigned to signed demotions
template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V),
          class V2 = VFromD<Rebind<TFromV<V>, DN>>,
          hwy::EnableIf<(sizeof(TFromD<DN>) < sizeof(TFromV<V>))>* = nullptr,
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
HWY_API VFromD<DN> DemoteTo(DN dn, V v) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(dn)> dn_u;

  // First, do a signed to signed demotion. This will convert any values
  // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a
  // negative value.
  const auto i2i_demote_result = DemoteTo(dn, BitCast(di, v));

  // Second, convert any negative values to hwy::HighestValue<TFromD<DN>>()
  // using an unsigned Min operation.
  const auto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>());

  return BitCast(
      dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val)));
}
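// Worked example (illustrative) for a U16->I8 demotion of v[i] = 0x8001:
// the BitCast to I16 yields -32767, the I16->I8 DemoteTo saturates it to -128
// (0x80), and the final unsigned Min(0x80, 0x7F) yields 0x7F, the expected
// saturated result for an input above HighestValue<int8_t>().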
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class DN, HWY_IF_SIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_U2I_DEMOTE_FROM_LANE_SIZE_V(V),
          class V2 = VFromD<Repartition<TFromV<V>, DN>>,
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_D(DFromV<V2>))>
HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
  const DFromV<decltype(a)> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(dn)> dn_u;

  // First, do a signed to signed demotion. This will convert any values
  // that are greater than hwy::HighestValue<MakeSigned<TFromV<V>>>() to a
  // negative value.
  const auto i2i_demote_result =
      ReorderDemote2To(dn, BitCast(di, a), BitCast(di, b));

  // Second, convert any negative values to hwy::HighestValue<TFromD<DN>>()
  // using an unsigned Min operation.
  const auto max_signed_val = Set(dn, hwy::HighestValue<TFromD<DN>>());

  return BitCast(
      dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val)));
}
#endif
// ------------------------------ PromoteLowerTo

// There is no codegen advantage for a native version of this. It is provided
// only for convenience.
template <class D, class V>
HWY_API VFromD<D> PromoteLowerTo(D d, V v) {
  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
  // because it cannot be deduced from D (could be either bf16 or f16).
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteTo(d, LowerHalf(dh, v));
}

// ------------------------------ PromoteUpperTo
#if (defined(HWY_NATIVE_PROMOTE_UPPER_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
#undef HWY_NATIVE_PROMOTE_UPPER_TO
#else
#define HWY_NATIVE_PROMOTE_UPPER_TO
#endif

// This requires UpperHalf.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, class V>
HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
  // because it cannot be deduced from D (could be either bf16 or f16).
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteTo(d, UpperHalf(dh, v));
}
#endif  // HWY_TARGET != HWY_SCALAR
#endif  // HWY_NATIVE_PROMOTE_UPPER_TO
// ------------------------------ float16_t <-> float
#if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif

template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<float16_t, D>> v) {
  const RebindToSigned<decltype(df32)> di32;
  const RebindToUnsigned<decltype(df32)> du32;
  const Rebind<uint16_t, decltype(df32)> du16;
  using VU32 = VFromD<decltype(du32)>;

  const VU32 bits16 = PromoteTo(du32, BitCast(du16, v));
  const VU32 sign = ShiftRight<15>(bits16);
  const VU32 biased_exp = And(ShiftRight<10>(bits16), Set(du32, 0x1F));
  const VU32 mantissa = And(bits16, Set(du32, 0x3FF));
  const VU32 subnormal =
      BitCast(du32, Mul(ConvertTo(df32, BitCast(di32, mantissa)),
                        Set(df32, 1.0f / 16384 / 1024)));

  const VU32 biased_exp32 = Add(biased_exp, Set(du32, 127 - 15));
  const VU32 mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const VU32 normal = Or(ShiftLeft<23>(biased_exp32), mantissa32);
  const VU32 bits32 = IfThenElse(Eq(biased_exp, Zero(du32)), subnormal, normal);
  return BitCast(df32, Or(ShiftLeft<31>(sign), bits32));
}
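// Worked example (illustrative): the F16 bit pattern 0x3C00 (1.0) has sign 0,
// biased_exp 15 and mantissa 0, so biased_exp32 = 15 + (127 - 15) = 127 and
// the result is 0x3F800000, i.e. 1.0f. A biased_exp of 0 instead selects the
// `subnormal` path, which rescales the integer mantissa by 2^-24.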
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<float, D>> v) {
  const RebindToSigned<decltype(df16)> di16;
  const Rebind<int32_t, decltype(df16)> di32;
  const RebindToFloat<decltype(di32)> df32;
  const RebindToUnsigned<decltype(df32)> du32;

  // There are 23 fractional bits (plus the implied 1 bit) in the mantissa of
  // a F32, and there are 10 fractional bits (plus the implied 1 bit) in the
  // mantissa of a F16.
  // We want the unbiased exponent of round_incr[i] to be at least (-14) + 13
  // as 2^(-14) is the smallest positive normal F16 value and as we want 13
  // mantissa bits (including the implicit 1 bit) to the left of the
  // F32 mantissa bits in rounded_val[i] since 23 - 10 is equal to 13.
  // The biased exponent of round_incr[i] needs to be at least 126 as
  // (-14) + 13 + 127 is equal to 126.
  // We also want the biased exponent of round_incr[i] to be less than or equal
  // to 255 (which is equal to MaxExponentField<float>()).
  // The biased F32 exponent of round_incr is equal to
  // HWY_MAX(HWY_MIN(((exp_bits[i] >> 23) & 255) + 13, 255), 126).

  // hi9_bits[i] is equal to the upper 9 bits of v[i].
  const auto hi9_bits = ShiftRight<23>(BitCast(du32, v));
  const auto k13 = Set(du32, uint32_t{13u});
  // Minimum biased F32 exponent of round_incr.
  const auto k126 = Set(du32, uint32_t{126u});

  // round_incr_hi9_bits[i] is equivalent to
  // (hi9_bits[i] & 0x100) |
  // HWY_MAX(HWY_MIN((hi9_bits[i] & 0xFF) + 13, 255), 126)
#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
  const auto k255 = Set(du32, uint32_t{255u});
  const auto round_incr_hi9_bits = BitwiseIfThenElse(
      k255, Max(Min(Add(And(hi9_bits, k255), k13), k255), k126), hi9_bits);
#else
  // On targets other than SCALAR and EMU128, the exponent bits of hi9_bits can
  // be incremented by 13 and clamped to the [13, 255] range without
  // overflowing into the sign bit of hi9_bits by using U8 SaturatedAdd, as
  // there are 8 exponent bits in an F32.
  // U8 Max can be used on targets other than SCALAR and EMU128 to clamp
  // ((hi9_bits & 0xFF) + 13) to the [126, 255] range without affecting the
  // sign bit.
  const Repartition<uint8_t, decltype(du32)> du32_as_u8;
  const auto round_incr_hi9_bits = BitCast(
      du32,
      Max(SaturatedAdd(BitCast(du32_as_u8, hi9_bits), BitCast(du32_as_u8, k13)),
          BitCast(du32_as_u8, k126)));
#endif

  // (round_incr_hi9_bits >> 8) is equal to (hi9_bits >> 8), and
  // (round_incr_hi9_bits & 0xFF) is equal to
  // HWY_MAX(HWY_MIN((hi9_bits & 0xFF) + 13, 255), 126).
  const auto round_incr = BitCast(df32, ShiftLeft<23>(round_incr_hi9_bits));

  // Add round_incr[i] to v[i] to round the mantissa to the nearest F16
  // mantissa and to move the fractional bits of the resulting non-NaN mantissa
  // down to the lower 10 bits of rounded_val if (v[i] + round_incr[i]) is a
  // non-NaN value.
  const auto rounded_val = Add(v, round_incr);

  // rounded_val_bits is the bits of rounded_val as a U32.
  const auto rounded_val_bits = BitCast(du32, rounded_val);

  // rounded_val[i] is known to have the same biased exponent as round_incr[i]
  // as |round_incr[i]| > 2^12*|v[i]| is true if round_incr[i] is a finite
  // value, round_incr[i] and v[i] both have the same sign, and |round_incr[i]|
  // is either a power of 2 that is greater than or equal to 2^-1 or infinity.
  // If rounded_val[i] is a finite F32 value, then
  // (rounded_val_bits[i] & 0x00000FFF) is the bit representation of the
  // rounded mantissa of rounded_val[i] as a UQ2.10 fixed point number that is
  // in the range [0, 2].
  // In other words, (rounded_val_bits[i] & 0x00000FFF) is between 0 and
  // 0x0800, with (rounded_val_bits[i] & 0x000003FF) being the fractional bits
  // of the resulting F16 mantissa, if rounded_val[i] is a finite F32 value.
  // (rounded_val_bits[i] & 0x007FF000) == 0 is guaranteed to be true if
  // rounded_val[i] is a non-NaN value.
  // The biased exponent of rounded_val[i] is guaranteed to be at least 126 as
  // the biased exponent of round_incr[i] is at least 126 and as both v[i] and
  // round_incr[i] have the same sign bit.
  // The ULP of a F32 value with a biased exponent of 126 is equal to
  // 2^(126 - 127 - 23), which is equal to 2^(-24) (which is also the ULP of a
  // F16 value with a biased exponent of 0 or 1 as (1 - 15 - 10) is equal to
  // -24).
  // The biased exponent (before subtracting by 126) needs to be clamped to the
  // [126, 157] range as 126 + 31 is equal to 157 and as 31 is the largest
  // biased exponent of a F16.
  // The biased exponent of the resulting F16 value is equal to
  // HWY_MIN((round_incr_hi9_bits[i] & 0xFF) +
  //         ((rounded_val_bits[i] >> 10) & 0xFF), 157) - 126
#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
  const auto k157Shl10 = Set(du32, static_cast<uint32_t>(uint32_t{157u} << 10));
  auto f16_exp_bits =
      Min(Add(ShiftLeft<10>(And(round_incr_hi9_bits, k255)),
              And(rounded_val_bits,
                  Set(du32, static_cast<uint32_t>(uint32_t{0xFFu} << 10)))),
          k157Shl10);
  const auto f16_result_is_inf_mask =
      RebindMask(df32, Eq(f16_exp_bits, k157Shl10));
#else
  const auto k157 = Set(du32, uint32_t{157});
  auto f16_exp_bits = BitCast(
      du32,
      Min(SaturatedAdd(BitCast(du32_as_u8, round_incr_hi9_bits),
                       BitCast(du32_as_u8, ShiftRight<10>(rounded_val_bits))),
          BitCast(du32_as_u8, k157)));
  const auto f16_result_is_inf_mask = RebindMask(df32, Eq(f16_exp_bits, k157));
  f16_exp_bits = ShiftLeft<10>(f16_exp_bits);
#endif

  f16_exp_bits =
      Sub(f16_exp_bits, Set(du32, static_cast<uint32_t>(uint32_t{126u} << 10)));

  const auto f16_unmasked_mant_bits =
      BitCast(di32, Or(IfThenZeroElse(f16_result_is_inf_mask, rounded_val),
                       VecFromMask(df32, IsNaN(rounded_val))));
  const auto f16_exp_mant_bits =
      OrAnd(BitCast(di32, f16_exp_bits), f16_unmasked_mant_bits,
            Set(di32, int32_t{0x03FF}));

  // f16_bits_as_i32 is the F16 bits sign-extended to an I32 (with the upper 17
  // bits of f16_bits_as_i32[i] set to the sign bit of rounded_val[i]) to allow
  // efficient truncation of the F16 bits to an I16 using an I32->I16 DemoteTo
  // operation.
  const auto f16_bits_as_i32 =
      OrAnd(f16_exp_mant_bits, ShiftRight<16>(BitCast(di32, rounded_val_bits)),
            Set(di32, static_cast<int32_t>(0xFFFF8000u)));
  return BitCast(df16, DemoteTo(di16, f16_bits_as_i32));
}
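// Worked example (illustrative) for v[i] = 1.0f (bits 0x3F800000):
// hi9_bits = 0x07F, so round_incr_hi9_bits = 127 + 13 = 140 and
// round_incr = 2^13. rounded_val = 8193.0f, whose bits are (140 << 23) | 0x400,
// so ((rounded_val_bits >> 10) & 0xFF) = 1 and the F16 biased exponent is
// min(140 + 1, 157) - 126 = 15; the low 10 bits are 0, giving 0x3C00 (1.0).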
#endif  // HWY_NATIVE_F16C

// ------------------------------ F64->F16 DemoteTo
#if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_DEMOTE_F64_TO_F16
#undef HWY_NATIVE_DEMOTE_F64_TO_F16
#else
#define HWY_NATIVE_DEMOTE_F64_TO_F16
#endif

#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D df16, VFromD<Rebind<double, D>> v) {
  const Rebind<double, D> df64;
  const Rebind<uint64_t, D> du64;
  const Rebind<float, D> df32;

  // The mantissa bits of v[i] are first rounded using round-to-odd rounding to
  // the nearest F64 value that has the lower 29 bits zeroed out to ensure that
  // the result is correctly rounded to a F16.
  const auto vf64_rounded = OrAnd(
      And(v,
          BitCast(df64, Set(du64, static_cast<uint64_t>(0xFFFFFFFFE0000000u)))),
      BitCast(df64, Add(BitCast(du64, v),
                        Set(du64, static_cast<uint64_t>(0x000000001FFFFFFFu)))),
      BitCast(df64, Set(du64, static_cast<uint64_t>(0x0000000020000000ULL))));
  return DemoteTo(df16, DemoteTo(df32, vf64_rounded));
}
#endif  // HWY_HAVE_FLOAT64
#endif  // HWY_NATIVE_DEMOTE_F64_TO_F16
// ------------------------------ F16->F64 PromoteTo
#if (defined(HWY_NATIVE_PROMOTE_F16_TO_F64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PROMOTE_F16_TO_F64
#undef HWY_NATIVE_PROMOTE_F16_TO_F64
#else
#define HWY_NATIVE_PROMOTE_F16_TO_F64
#endif

#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F64_D(D)>
HWY_API VFromD<D> PromoteTo(D df64, VFromD<Rebind<float16_t, D>> v) {
  return PromoteTo(df64, PromoteTo(Rebind<float, D>(), v));
}
#endif  // HWY_HAVE_FLOAT64
#endif  // HWY_NATIVE_PROMOTE_F16_TO_F64
// ------------------------------ F32 to BF16 DemoteTo
#if (defined(HWY_NATIVE_DEMOTE_F32_TO_BF16) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
#else
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
#endif

namespace detail {

// Round a F32 value to the nearest BF16 value, with the result returned as the
// rounded F32 value bitcasted to an U32.
// RoundF32ForDemoteToBF16 also converts NaN values to QNaN values to prevent
// NaN F32 values from being converted to an infinity.
template <class V, HWY_IF_F32(TFromV<V>)>
HWY_INLINE VFromD<RebindToUnsigned<DFromV<V>>> RoundF32ForDemoteToBF16(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du32;

  const auto is_non_nan = Not(IsNaN(v));
  const auto bits32 = BitCast(du32, v);

  const auto round_incr =
      Add(And(ShiftRight<16>(bits32), Set(du32, uint32_t{1})),
          Set(du32, uint32_t{0x7FFFu}));
  return MaskedAddOr(Or(bits32, Set(du32, uint32_t{0x00400000u})),
                     RebindMask(du32, is_non_nan), bits32, round_incr);
}

}  // namespace detail

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> DemoteTo(D dbf16, VFromD<Rebind<float, D>> v) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  const Twice<decltype(du16)> dt_u16;

  const auto rounded_bits = BitCast(dt_u16, detail::RoundF32ForDemoteToBF16(v));
#if HWY_IS_LITTLE_ENDIAN
  return BitCast(
      dbf16, LowerHalf(du16, ConcatOdd(dt_u16, rounded_bits, rounded_bits)));
#else
  return BitCast(
      dbf16, LowerHalf(du16, ConcatEven(dt_u16, rounded_bits, rounded_bits)));
#endif
}
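// Worked example (illustrative): bits32 = 0x3F808000 lies exactly halfway
// between the BF16 values 0x3F80 and 0x3F81; bit 16 is 0, so round_incr =
// 0x7FFF and bits32 + round_incr = 0x3F80FFFF, whose upper 16 bits are 0x3F80:
// ties are rounded to the value whose low mantissa bit is even.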
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
                                   VFromD<Repartition<float, D>> b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
  const auto rounded_a_bits32 =
      BitCast(du16, detail::RoundF32ForDemoteToBF16(a));
  const auto rounded_b_bits32 =
      BitCast(du16, detail::RoundF32ForDemoteToBF16(b));
#if HWY_IS_LITTLE_ENDIAN
  return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, rounded_b_bits32),
                                  BitCast(du16, rounded_a_bits32)));
#else
  return BitCast(dbf16, ConcatEven(du16, BitCast(du16, rounded_b_bits32),
                                   BitCast(du16, rounded_a_bits32)));
#endif
}

template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
                                   VFromD<Repartition<float, D>> b) {
  const RebindToUnsigned<decltype(dbf16)> du16;
#if HWY_IS_LITTLE_ENDIAN
  const auto a_in_odd = detail::RoundF32ForDemoteToBF16(a);
  const auto b_in_even = ShiftRight<16>(detail::RoundF32ForDemoteToBF16(b));
#else
  const auto a_in_odd = ShiftRight<16>(detail::RoundF32ForDemoteToBF16(a));
  const auto b_in_even = detail::RoundF32ForDemoteToBF16(b);
#endif
  return BitCast(dbf16,
                 OddEven(BitCast(du16, a_in_odd), BitCast(du16, b_in_even)));
}
#endif  // HWY_NATIVE_DEMOTE_F32_TO_BF16
// ------------------------------ PromoteInRangeTo
#if (defined(HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#endif

#if HWY_HAVE_INTEGER64
template <class D64, HWY_IF_UI64_D(D64)>
HWY_API VFromD<D64> PromoteInRangeTo(D64 d64, VFromD<Rebind<float, D64>> v) {
  return PromoteTo(d64, v);
}
#endif
#endif  // HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO

// ------------------------------ ConvertInRangeTo
#if (defined(HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#undef HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#else
#define HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
#endif

template <class DI, HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(DI),
          HWY_IF_T_SIZE_ONE_OF_D(DI, (HWY_HAVE_FLOAT16 ? (1 << 2) : 0) |
                                          (1 << 4) |
                                          (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
HWY_API VFromD<DI> ConvertInRangeTo(DI di, VFromD<RebindToFloat<DI>> v) {
  return ConvertTo(di, v);
}
#endif  // HWY_NATIVE_F2I_CONVERT_IN_RANGE_TO
// ------------------------------ DemoteInRangeTo
#if (defined(HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO
#endif

#if HWY_HAVE_FLOAT64
template <class D32, HWY_IF_UI32_D(D32)>
HWY_API VFromD<D32> DemoteInRangeTo(D32 d32, VFromD<Rebind<double, D32>> v) {
  return DemoteTo(d32, v);
}
#endif
#endif  // HWY_NATIVE_F64_TO_UI32_DEMOTE_IN_RANGE_TO

// ------------------------------ PromoteInRangeLowerTo/PromoteInRangeUpperTo
template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeLowerTo(D d, V v) {
  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
  // because it cannot be deduced from D (could be either bf16 or f16).
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteInRangeTo(d, LowerHalf(dh, v));
}

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeUpperTo(D d, V v) {
#if (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
     (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
  // On targets that provide target-specific implementations of F32->UI64
  // PromoteInRangeTo, promote the upper half of v using PromoteInRangeTo.
  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
  // because it cannot be deduced from D (could be either bf16 or f16).
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteInRangeTo(d, UpperHalf(dh, v));
#else
  // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
  // around F32->UI64 PromoteTo, promote the upper half of v to TFromD<D> using
  // PromoteUpperTo.
  return PromoteUpperTo(d, v);
#endif
}
#endif  // HWY_TARGET != HWY_SCALAR
// ------------------------------ PromoteInRangeEvenTo/PromoteInRangeOddTo
template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeEvenTo(D d, V v) {
#if HWY_TARGET == HWY_SCALAR
  return PromoteInRangeTo(d, v);
#elif (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
       (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
  // On targets that provide target-specific implementations of F32->UI64
  // PromoteInRangeTo, promote the even lanes of v using PromoteInRangeTo.
  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
  // because it cannot be deduced from D (could be either bf16 or f16).
  const DFromV<decltype(v)> d_from;
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteInRangeTo(d, LowerHalf(dh, ConcatEven(d_from, v, v)));
#else
  // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
  // around F32->UI64 PromoteTo, promote the even lanes of v to TFromD<D> using
  // PromoteEvenTo.
  return PromoteEvenTo(d, v);
#endif  // HWY_TARGET == HWY_SCALAR
}

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_UI64_D(D), class V, HWY_IF_F32(TFromV<V>)>
HWY_API VFromD<D> PromoteInRangeOddTo(D d, V v) {
#if (HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_EMU128 || \
     (HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64))
  // On targets that provide target-specific implementations of F32->UI64
  // PromoteInRangeTo, promote the odd lanes of v using PromoteInRangeTo.
  // Lanes(d) may differ from Lanes(DFromV<V>()). Use the lane type from V
  // because it cannot be deduced from D (could be either bf16 or f16).
  const DFromV<decltype(v)> d_from;
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteInRangeTo(d, LowerHalf(dh, ConcatOdd(d_from, v, v)));
#else
  // Otherwise, on targets where F32->UI64 PromoteInRangeTo is simply a wrapper
  // around F32->UI64 PromoteTo, promote the odd lanes of v to TFromD<D> using
  // PromoteOddTo.
  return PromoteOddTo(d, v);
#endif
}
#endif  // HWY_TARGET != HWY_SCALAR
// ------------------------------ SumsOf2
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
namespace detail {

template <class TypeTag, size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const RepartitionToWide<decltype(d)> dw;
  return Add(PromoteEvenTo(dw, v), PromoteOddTo(dw, v));
}

}  // namespace detail

template <class V>
HWY_API VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(V v) {
  return detail::SumsOf2(hwy::TypeTag<TFromV<V>>(),
                         hwy::SizeTag<sizeof(TFromV<V>)>(), v);
}
#endif  // HWY_TARGET != HWY_SCALAR

// ------------------------------ SumsOf4
namespace detail {

template <class TypeTag, size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(
    TypeTag /*type_tag*/, hwy::SizeTag<kLaneSize> /*lane_size_tag*/, V v) {
  using hwy::HWY_NAMESPACE::SumsOf2;
  return SumsOf2(SumsOf2(v));
}

}  // namespace detail

template <class V>
HWY_API VFromD<RepartitionToWideX2<DFromV<V>>> SumsOf4(V v) {
  return detail::SumsOf4(hwy::TypeTag<TFromV<V>>(),
                         hwy::SizeTag<sizeof(TFromV<V>)>(), v);
}
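// Example (illustrative): for uint8_t lanes {1, 2, 3, 4, ...}, SumsOf2 yields
// uint16_t lanes {3, 7, ...} and SumsOf4 yields uint32_t lanes {10, ...};
// each output lane is the sum of the 2 (resp. 4) adjacent input lanes.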
// ------------------------------ OrderedTruncate2To
#if HWY_IDE || \
    (defined(HWY_NATIVE_ORDERED_TRUNCATE_2_TO) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#else
#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO
#endif

// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class DN, HWY_IF_UNSIGNED_D(DN), class V, HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
          HWY_IF_LANES_D(DFromV<VFromD<DN>>, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<DN> OrderedTruncate2To(DN dn, V a, V b) {
  return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a));
}
#endif  // HWY_TARGET != HWY_SCALAR
#endif  // HWY_NATIVE_ORDERED_TRUNCATE_2_TO
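// Example (illustrative): for uint16_t inputs a = {0x1234, 0x5678, ...} and
// some b, OrderedTruncate2To returns the truncated (low) half of every lane of
// a followed by those of b, i.e. {0x34, 0x78, ..., then b's low bytes},
// preserving lane order.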
// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
#if (defined(HWY_NATIVE_LEADING_ZERO_COUNT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

namespace detail {

template <class D, HWY_IF_U32_D(D)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const RebindToFloat<decltype(d)> df;
#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2
  const RebindToSigned<decltype(d)> di;
  const Repartition<int16_t, decltype(d)> di16;

  // On SSE2/SSSE3/SSE4/AVX2, do an int32_t to float conversion, followed
  // by an unsigned right shift of the uint32_t bit representation of the
  // floating point values by 23, followed by an int16_t Min
  // operation as we are only interested in the biased exponent that would
  // result from a uint32_t to float conversion.
  // An int32_t to float vector conversion is also much more efficient on
  // SSE2/SSSE3/SSE4/AVX2 than a uint32_t vector to float vector conversion,
  // as a uint32_t vector to float vector conversion on SSE2/SSSE3/SSE4/AVX2
  // requires multiple instructions whereas an int32_t to float vector
  // conversion can be carried out using a single instruction on
  // SSE2/SSSE3/SSE4/AVX2.
  const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(di, v)));
  return BitCast(d, Min(BitCast(di16, ShiftRight<23>(f32_bits)),
                        BitCast(di16, Set(d, 158))));
#else
  const auto f32_bits = BitCast(d, ConvertTo(df, v));
  return BitCast(d, ShiftRight<23>(f32_bits));
#endif
}

template <class V, HWY_IF_U32_D(DFromV<V>)>
HWY_INLINE V I32RangeU32ToF32BiasedExp(V v) {
  // I32RangeU32ToF32BiasedExp is similar to UIntToF32BiasedExp, but
  // I32RangeU32ToF32BiasedExp assumes that v[i] is between 0 and 2147483647.
  const DFromV<decltype(v)> d;
  const RebindToFloat<decltype(d)> df;
#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2
  const RebindToSigned<decltype(d)> d_src;
#else
  const RebindToUnsigned<decltype(d)> d_src;
#endif
  const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(d_src, v)));
  return ShiftRight<23>(f32_bits);
}
template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const Rebind<uint32_t, decltype(d)> du32;
  const auto f32_biased_exp_as_u32 =
      I32RangeU32ToF32BiasedExp(PromoteTo(du32, v));
  return TruncateTo(d, f32_biased_exp_as_u32);
}

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const Half<decltype(d)> dh;
  const Rebind<uint32_t, decltype(dh)> du32;

  const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v));
  const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v));

  const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32);
  const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32);
#if HWY_TARGET <= HWY_SSE2
  const RebindToSigned<decltype(du32)> di32;
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 OrderedDemote2To(di, BitCast(di32, lo_f32_biased_exp_as_u32),
                                  BitCast(di32, hi_f32_biased_exp_as_u32)));
#else
  return OrderedTruncate2To(d, lo_f32_biased_exp_as_u32,
                            hi_f32_biased_exp_as_u32);
#endif
}
#endif  // HWY_TARGET != HWY_SCALAR

template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 4)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const Rebind<uint32_t, decltype(d)> du32;
  const auto f32_biased_exp_as_u32 =
      I32RangeU32ToF32BiasedExp(PromoteTo(du32, v));
  return U8FromU32(f32_biased_exp_as_u32);
}

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 4),
          HWY_IF_LANES_LE_D(D, HWY_MAX_BYTES / 2)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const Half<decltype(d)> dh;
  const Rebind<uint32_t, decltype(dh)> du32;
  const Repartition<uint16_t, decltype(du32)> du16;

  const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v));
  const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v));

  const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32);
  const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32);
#if HWY_TARGET <= HWY_SSE2
  const RebindToSigned<decltype(du32)> di32;
  const RebindToSigned<decltype(du16)> di16;
  const auto f32_biased_exp_as_i16 =
      OrderedDemote2To(di16, BitCast(di32, lo_f32_biased_exp_as_u32),
                       BitCast(di32, hi_f32_biased_exp_as_u32));
  return DemoteTo(d, f32_biased_exp_as_i16);
#else
  const auto f32_biased_exp_as_u16 = OrderedTruncate2To(
      du16, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32);
  return TruncateTo(d, f32_biased_exp_as_u16);
#endif
}

template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, HWY_MAX_BYTES / 2)>
HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  const Half<decltype(d)> dh;
  const Half<decltype(dh)> dq;
  const Rebind<uint32_t, decltype(dq)> du32;
  const Repartition<uint16_t, decltype(du32)> du16;

  const auto lo_half = LowerHalf(dh, v);
  const auto hi_half = UpperHalf(dh, v);

  const auto u32_q0 = PromoteTo(du32, LowerHalf(dq, lo_half));
  const auto u32_q1 = PromoteTo(du32, UpperHalf(dq, lo_half));
  const auto u32_q2 = PromoteTo(du32, LowerHalf(dq, hi_half));
  const auto u32_q3 = PromoteTo(du32, UpperHalf(dq, hi_half));

  const auto f32_biased_exp_as_u32_q0 = I32RangeU32ToF32BiasedExp(u32_q0);
  const auto f32_biased_exp_as_u32_q1 = I32RangeU32ToF32BiasedExp(u32_q1);
  const auto f32_biased_exp_as_u32_q2 = I32RangeU32ToF32BiasedExp(u32_q2);
  const auto f32_biased_exp_as_u32_q3 = I32RangeU32ToF32BiasedExp(u32_q3);
#if HWY_TARGET <= HWY_SSE2
  const RebindToSigned<decltype(du32)> di32;
  const RebindToSigned<decltype(du16)> di16;

  const auto lo_f32_biased_exp_as_i16 =
      OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q0),
                       BitCast(di32, f32_biased_exp_as_u32_q1));
  const auto hi_f32_biased_exp_as_i16 =
      OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q2),
                       BitCast(di32, f32_biased_exp_as_u32_q3));
  return OrderedDemote2To(d, lo_f32_biased_exp_as_i16,
                          hi_f32_biased_exp_as_i16);
#else
  const auto lo_f32_biased_exp_as_u16 = OrderedTruncate2To(
      du16, f32_biased_exp_as_u32_q0, f32_biased_exp_as_u32_q1);
  const auto hi_f32_biased_exp_as_u16 = OrderedTruncate2To(
      du16, f32_biased_exp_as_u32_q2, f32_biased_exp_as_u32_q3);
  return OrderedTruncate2To(d, lo_f32_biased_exp_as_u16,
                            hi_f32_biased_exp_as_u16);
#endif
}
#endif  // HWY_TARGET != HWY_SCALAR
  2939. #if HWY_TARGET == HWY_SCALAR
  2940. template <class D>
  2941. using F32ExpLzcntMinMaxRepartition = RebindToUnsigned<D>;
  2942. #elif HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2
  2943. template <class D>
  2944. using F32ExpLzcntMinMaxRepartition = Repartition<uint8_t, D>;
  2945. #else
  2946. template <class D>
  2947. using F32ExpLzcntMinMaxRepartition =
  2948. Repartition<UnsignedFromSize<HWY_MIN(sizeof(TFromD<D>), 4)>, D>;
  2949. #endif
  2950. template <class V>
  2951. using F32ExpLzcntMinMaxCmpV = VFromD<F32ExpLzcntMinMaxRepartition<DFromV<V>>>;
  2952. template <class V>
  2953. HWY_INLINE F32ExpLzcntMinMaxCmpV<V> F32ExpLzcntMinMaxBitCast(V v) {
  2954. const DFromV<decltype(v)> d;
  2955. const F32ExpLzcntMinMaxRepartition<decltype(d)> d2;
  2956. return BitCast(d2, v);
  2957. }
  2958. template <class D, HWY_IF_U64_D(D)>
  2959. HWY_INLINE VFromD<D> UIntToF32BiasedExp(D d, VFromD<D> v) {
  2960. #if HWY_TARGET == HWY_SCALAR
  2961. const uint64_t u64_val = GetLane(v);
  2962. const float f32_val = static_cast<float>(u64_val);
  2963. const uint32_t f32_bits = BitCastScalar<uint32_t>(f32_val);
  2964. return Set(d, static_cast<uint64_t>(f32_bits >> 23));
  2965. #else
  2966. const Repartition<uint32_t, decltype(d)> du32;
  2967. const auto f32_biased_exp = UIntToF32BiasedExp(du32, BitCast(du32, v));
  2968. const auto f32_biased_exp_adj =
  2969. IfThenZeroElse(Eq(f32_biased_exp, Zero(du32)),
  2970. BitCast(du32, Set(d, 0x0000002000000000u)));
  2971. const auto adj_f32_biased_exp = Add(f32_biased_exp, f32_biased_exp_adj);
  2972. return ShiftRight<32>(BitCast(
  2973. d, Max(F32ExpLzcntMinMaxBitCast(adj_f32_biased_exp),
  2974. F32ExpLzcntMinMaxBitCast(Reverse2(du32, adj_f32_biased_exp)))));
  2975. #endif
  2976. }
  2977. template <class V, HWY_IF_UNSIGNED_V(V)>
  2978. HWY_INLINE V UIntToF32BiasedExp(V v) {
  2979. const DFromV<decltype(v)> d;
  2980. return UIntToF32BiasedExp(d, v);
  2981. }
  2982. template <class V, HWY_IF_UNSIGNED_V(V),
  2983. HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
  2984. HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) {
  2985. return v;
  2986. }
  2987. template <class V, HWY_IF_UNSIGNED_V(V),
  2988. HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
  2989. HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) {
  2990. // If v[i] >= 16777216 is true, make sure that the bit at
  2991. // HighestSetBitIndex(v[i]) - 24 is zeroed out to ensure that any inexact
  2992. // conversion to single-precision floating point is rounded down.
  2993. // This zeroing-out can be accomplished through the AndNot operation below.
  2994. return AndNot(ShiftRight<24>(v), v);
  2995. }
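// Worked example (illustrative): v[i] = 0x01FFFFFF (2^25 - 1) would round up
// to 2^25 when converted to float, overstating the exponent by 1.
// ShiftRight<24>(v) = 1, and the AndNot clears bit 0, giving 0x01FFFFFE,
// which converts to a float with the correct biased exponent (127 + 24).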
  2996. } // namespace detail
  2997. template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
  2998. HWY_API V HighestSetBitIndex(V v) {
  2999. const DFromV<decltype(v)> d;
  3000. const RebindToUnsigned<decltype(d)> du;
  3001. using TU = TFromD<decltype(du)>;
  3002. const auto f32_biased_exp = detail::UIntToF32BiasedExp(
  3003. detail::NormalizeForUIntTruncConvToF32(BitCast(du, v)));
  3004. return BitCast(d, Sub(f32_biased_exp, Set(du, TU{127})));
  3005. }
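// Examples (illustrative): v[i] = 1 maps to float 1.0f (biased exponent 127),
// so HighestSetBitIndex returns 0; v[i] = 0x80000000u maps to 2^31 (biased
// exponent 158), so the result is 31. For v[i] == 0 the biased exponent is 0
// and the subtraction wraps, so the result is not meaningful.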
  3006. template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
  3007. HWY_API V LeadingZeroCount(V v) {
  3008. const DFromV<decltype(v)> d;
  3009. const RebindToUnsigned<decltype(d)> du;
  3010. using TU = TFromD<decltype(du)>;
  3011. constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
  3012. const auto f32_biased_exp = detail::UIntToF32BiasedExp(
  3013. detail::NormalizeForUIntTruncConvToF32(BitCast(du, v)));
  3014. const auto lz_count = Sub(Set(du, TU{kNumOfBitsInT + 126}), f32_biased_exp);
  3015. return BitCast(d,
  3016. Min(detail::F32ExpLzcntMinMaxBitCast(lz_count),
  3017. detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT))));
  3018. }
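// Examples (illustrative, 32-bit lanes): v[i] = 1 has biased exponent 127, so
// the count is (32 + 126) - 127 = 31. v[i] = 0 has biased exponent 0, so the
// intermediate count is 158 and the final Min clamps the result to 32.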
  3019. template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
  3020. HWY_API V TrailingZeroCount(V v) {
  3021. const DFromV<decltype(v)> d;
  3022. const RebindToUnsigned<decltype(d)> du;
  3023. const RebindToSigned<decltype(d)> di;
  3024. using TU = TFromD<decltype(du)>;
  3025. const auto vi = BitCast(di, v);
  3026. const auto lowest_bit = BitCast(du, And(vi, Neg(vi)));
  3027. constexpr TU kNumOfBitsInT{sizeof(TU) * 8};
  3028. const auto f32_biased_exp = detail::UIntToF32BiasedExp(lowest_bit);
  3029. const auto tz_count = Sub(f32_biased_exp, Set(du, TU{127}));
  3030. return BitCast(d,
  3031. Min(detail::F32ExpLzcntMinMaxBitCast(tz_count),
  3032. detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT))));
  3033. }
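// Example (illustrative, 32-bit lanes): v[i] = 12 (0b1100) has lowest_bit = 4,
// whose biased exponent is 129, so the count is 129 - 127 = 2. For v[i] = 0,
// lowest_bit is 0 (biased exponent 0), the subtraction wraps, and the Min
// clamps the result to 32.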
  3034. #endif // HWY_NATIVE_LEADING_ZERO_COUNT
  3035. // ------------------------------ AESRound
  3036. // Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
  3037. #if HWY_TARGET != HWY_SCALAR || HWY_IDE
  3038. // Define for white-box testing, even if native instructions are available.
  3039. namespace detail {
  3040. // Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with
  3041. // Vector Permute Instructions" and the accompanying assembly language
  3042. // implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan:
  3043. // https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html .
  3044. //
  3045. // A brute-force 256 byte table lookup can also be made constant-time, and
  3046. // possibly competitive on NEON, but this is more performance-portable
  3047. // especially for x86 and large vectors.
  3048. template <class V> // u8
  3049. HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL,
  3050. V affine_tblU) {
  3051. const DFromV<V> du;
  3052. const auto mask = Set(du, uint8_t{0xF});
  3053. // Change polynomial basis to GF(2^4)
  3054. {
  3055. const VFromD<decltype(du)> basisL =
  3056. Dup128VecFromValues(du, 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
  3057. 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA);
  3058. const VFromD<decltype(du)> basisU =
  3059. Dup128VecFromValues(du, 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
  3060. 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD);
  3061. const auto sL = And(state, mask);
  3062. const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
  3063. const auto gf4L = TableLookupBytes(basisL, sL);
  3064. const auto gf4U = TableLookupBytes(basisU, sU);
  3065. state = Xor(gf4L, gf4U);
  3066. }
  3067. // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
  3068. // cause TableLookupBytesOr0 to return 0.
  3069. const VFromD<decltype(du)> zetaInv = Dup128VecFromValues(
  3070. du, 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3);
  3071. const VFromD<decltype(du)> tbl = Dup128VecFromValues(
  3072. du, 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4);
  3073. const auto sL = And(state, mask); // L=low nibble, U=upper
  3074. const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
  3075. const auto sX = Xor(sU, sL);
  3076. const auto invL = TableLookupBytes(zetaInv, sL);
  3077. const auto invU = TableLookupBytes(tbl, sU);
  3078. const auto invX = TableLookupBytes(tbl, sX);
  3079. const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
  3080. const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));
  3081. const auto affL = TableLookupBytesOr0(affine_tblL, outL);
  3082. const auto affU = TableLookupBytesOr0(affine_tblU, outU);
  3083. return Xor(affL, affU);
  3084. }
  3085. template <class V> // u8
  3086. HWY_INLINE V SubBytes(V state) {
  3087. const DFromV<V> du;
  3088. // Linear skew (cannot bake 0x63 bias into the table because out* indices
  3089. // may have the infinity flag set).
  3090. const VFromD<decltype(du)> affineL =
  3091. Dup128VecFromValues(du, 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
  3092. 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15);
  3093. const VFromD<decltype(du)> affineU =
  3094. Dup128VecFromValues(du, 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
  3095. 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E);
  3096. return Xor(SubBytesMulInverseAndAffineLookup(state, affineL, affineU),
  3097. Set(du, uint8_t{0x63}));
  3098. }
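// Sanity check (illustrative): SubBytes(Zero(du)) yields 0x63 in every byte,
// matching the AES S-box entry S(0x00) = 0x63: the GF(2^8) inverse of 0 is
// treated as 0, the affine lookups return 0, and the final Xor adds 0x63.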
  3099. template <class V> // u8
  3100. HWY_INLINE V InvSubBytes(V state) {
  3101. const DFromV<V> du;
  3102. const VFromD<decltype(du)> gF2P4InvToGF2P8InvL =
  3103. Dup128VecFromValues(du, 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13,
  3104. 0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7);
  3105. const VFromD<decltype(du)> gF2P4InvToGF2P8InvU =
  3106. Dup128VecFromValues(du, 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12,
  3107. 0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA);
  3108. // Apply the inverse affine transformation
  3109. const auto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)),
  3110. Or(ShiftLeft<3>(state), ShiftRight<5>(state)),
  3111. Or(ShiftLeft<6>(state), ShiftRight<2>(state))),
  3112. Set(du, uint8_t{0x05}));
  3113. // The GF(2^8) multiplicative inverse is computed as follows:
  3114. // - Changing the polynomial basis to GF(2^4)
  3115. // - Computing the GF(2^4) multiplicative inverse
  3116. // - Converting the GF(2^4) multiplicative inverse to the GF(2^8)
  3117. // multiplicative inverse through table lookups using the
  3118. // kGF2P4InvToGF2P8InvL and kGF2P4InvToGF2P8InvU tables
  3119. return SubBytesMulInverseAndAffineLookup(b, gF2P4InvToGF2P8InvL,
  3120. gF2P4InvToGF2P8InvU);
  3121. }
  3122. } // namespace detail
  3123. #endif // HWY_TARGET != HWY_SCALAR
  3124. #if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
  3125. #ifdef HWY_NATIVE_AES
  3126. #undef HWY_NATIVE_AES
  3127. #else
  3128. #define HWY_NATIVE_AES
  3129. #endif
  3130. // (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
  3131. #if HWY_TARGET != HWY_SCALAR || HWY_IDE
  3132. namespace detail {
  3133. template <class V> // u8
  3134. HWY_INLINE V ShiftRows(const V state) {
  3135. const DFromV<V> du;
  3136. // transposed: state is column major
  3137. const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
  3138. du, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11);
  3139. return TableLookupBytes(state, shift_row);
  3140. }
  3141. template <class V> // u8
  3142. HWY_INLINE V InvShiftRows(const V state) {
  3143. const DFromV<V> du;
  3144. // transposed: state is column major
  3145. const VFromD<decltype(du)> shift_row = Dup128VecFromValues(
  3146. du, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3);
  3147. return TableLookupBytes(state, shift_row);
  3148. }
  3149. template <class V> // u8
  3150. HWY_INLINE V GF2P8Mod11BMulBy2(V v) {
  3151. const DFromV<V> du;
  3152. const RebindToSigned<decltype(du)> di; // can only do signed comparisons
  3153. const auto msb = Lt(BitCast(di, v), Zero(di));
  3154. const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B})));
  3155. return Xor(Add(v, v), overflow); // = v*2 in GF(2^8).
  3156. }
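// Worked examples from FIPS-197 (illustrative): GF2P8Mod11BMulBy2(0x57) = 0xAE
// (MSB clear, no reduction), and GF2P8Mod11BMulBy2(0xAE) = 0x15C ^ 0x11B =
// 0x47 (MSB set, so 0x1B is XORed in after the shift).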
  3157. template <class V> // u8
  3158. HWY_INLINE V MixColumns(const V state) {
  3159. const DFromV<V> du;
  3160. // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
  3161. // 2 3 1 1 // Let s := state*1, d := state*2, t := state*3.
  3162. // 1 2 3 1 // d are on diagonal, no permutation needed.
  3163. // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows.
  3164. // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301).
  3165. const VFromD<decltype(du)> v2301 = Dup128VecFromValues(
  3166. du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
  3167. const VFromD<decltype(du)> v1230 = Dup128VecFromValues(
  3168. du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
  3169. const auto d = GF2P8Mod11BMulBy2(state); // = state*2 in GF(2^8).
  3170. const auto s2301 = TableLookupBytes(state, v2301);
  3171. const auto d_s2301 = Xor(d, s2301);
  3172. const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)}
  3173. const auto t1230_s3012 = TableLookupBytes(t_s2301, v1230);
  3174. return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms
  3175. }
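// Verification for lane 0 (illustrative): d_s2301[0] = 2*s0 ^ s2 and
// t1230_s3012[0] = t_s2301[1] = 3*s1 ^ s3, so the returned byte is
// 2*s0 ^ 3*s1 ^ s2 ^ s3, matching the first matrix row {2 3 1 1}.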
  3176. template <class V> // u8
  3177. HWY_INLINE V InvMixColumns(const V state) {
  3178. const DFromV<V> du;
  3179. // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
  3180. // 14 11 13 9
  3181. // 9 14 11 13
  3182. // 13 9 14 11
  3183. // 11 13 9 14
  3184. const VFromD<decltype(du)> v2301 = Dup128VecFromValues(
  3185. du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
  3186. const VFromD<decltype(du)> v1230 = Dup128VecFromValues(
  3187. du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
  3188. const auto sx2 = GF2P8Mod11BMulBy2(state); /* = state*2 in GF(2^8) */
  3189. const auto sx4 = GF2P8Mod11BMulBy2(sx2); /* = state*4 in GF(2^8) */
  3190. const auto sx8 = GF2P8Mod11BMulBy2(sx4); /* = state*8 in GF(2^8) */
  3191. const auto sx9 = Xor(sx8, state); /* = state*9 in GF(2^8) */
  3192. const auto sx11 = Xor(sx9, sx2); /* = state*11 in GF(2^8) */
  3193. const auto sx13 = Xor(sx9, sx4); /* = state*13 in GF(2^8) */
  3194. const auto sx14 = Xor3(sx8, sx4, sx2); /* = state*14 in GF(2^8) */
  3195. const auto sx13_0123_sx9_1230 = Xor(sx13, TableLookupBytes(sx9, v1230));
  3196. const auto sx14_0123_sx11_1230 = Xor(sx14, TableLookupBytes(sx11, v1230));
  3197. const auto sx13_2301_sx9_3012 = TableLookupBytes(sx13_0123_sx9_1230, v2301);
  3198. return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012);
  3199. }
  3200. } // namespace detail
  3201. template <class V> // u8
  3202. HWY_API V AESRound(V state, const V round_key) {
  3203. // Intel docs swap the first two steps, but it does not matter because
  3204. // ShiftRows is a permutation and SubBytes is independent of lane index.
  3205. state = detail::SubBytes(state);
  3206. state = detail::ShiftRows(state);
  3207. state = detail::MixColumns(state);
  3208. state = Xor(state, round_key); // AddRoundKey
  3209. return state;
  3210. }
  3211. template <class V> // u8
  3212. HWY_API V AESLastRound(V state, const V round_key) {
3213. // Like AESRound, but without MixColumns.
  3214. state = detail::SubBytes(state);
  3215. state = detail::ShiftRows(state);
  3216. state = Xor(state, round_key); // AddRoundKey
  3217. return state;
  3218. }
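// Usage sketch (illustrative, not part of the API): encrypting one block with
// AES-128, assuming the 11 round keys have already been expanded into u8
// vectors rk[0..10]:
//   auto block_state = Xor(block, rk[0]);              // initial AddRoundKey
//   for (int r = 1; r <= 9; ++r) block_state = AESRound(block_state, rk[r]);
//   block_state = AESLastRound(block_state, rk[10]);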
  3219. template <class V>
  3220. HWY_API V AESInvMixColumns(V state) {
  3221. return detail::InvMixColumns(state);
  3222. }
  3223. template <class V> // u8
  3224. HWY_API V AESRoundInv(V state, const V round_key) {
  3225. state = detail::InvSubBytes(state);
  3226. state = detail::InvShiftRows(state);
  3227. state = detail::InvMixColumns(state);
  3228. state = Xor(state, round_key); // AddRoundKey
  3229. return state;
  3230. }
  3231. template <class V> // u8
  3232. HWY_API V AESLastRoundInv(V state, const V round_key) {
  3233. // Like AESRoundInv, but without InvMixColumns.
  3234. state = detail::InvSubBytes(state);
  3235. state = detail::InvShiftRows(state);
  3236. state = Xor(state, round_key); // AddRoundKey
  3237. return state;
  3238. }
  3239. template <uint8_t kRcon, class V, HWY_IF_U8_D(DFromV<V>)>
  3240. HWY_API V AESKeyGenAssist(V v) {
  3241. const DFromV<decltype(v)> d;
  3242. const V rconXorMask = Dup128VecFromValues(d, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0,
  3243. 0, 0, kRcon, 0, 0, 0);
  3244. const V rotWordShuffle = Dup128VecFromValues(d, 4, 5, 6, 7, 5, 6, 7, 4, 12,
  3245. 13, 14, 15, 13, 14, 15, 12);
  3246. const auto sub_word_result = detail::SubBytes(v);
  3247. const auto rot_word_result =
  3248. TableLookupBytes(sub_word_result, rotWordShuffle);
  3249. return Xor(rot_word_result, rconXorMask);
  3250. }
  3251. // Constant-time implementation inspired by
  3252. // https://www.bearssl.org/constanttime.html, but about half the cost because we
  3253. // use 64x64 multiplies and 128-bit XORs.
  3254. template <class V>
  3255. HWY_API V CLMulLower(V a, V b) {
  3256. const DFromV<V> d;
  3257. static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
  3258. const auto k1 = Set(d, 0x1111111111111111ULL);
  3259. const auto k2 = Set(d, 0x2222222222222222ULL);
  3260. const auto k4 = Set(d, 0x4444444444444444ULL);
  3261. const auto k8 = Set(d, 0x8888888888888888ULL);
  3262. const auto a0 = And(a, k1);
  3263. const auto a1 = And(a, k2);
  3264. const auto a2 = And(a, k4);
  3265. const auto a3 = And(a, k8);
  3266. const auto b0 = And(b, k1);
  3267. const auto b1 = And(b, k2);
  3268. const auto b2 = And(b, k4);
  3269. const auto b3 = And(b, k8);
  3270. auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3));
  3271. auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0));
  3272. auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1));
  3273. auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2));
  3274. m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1)));
  3275. m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2)));
  3276. m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3)));
  3277. m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0)));
  3278. return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
  3279. }
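// Example (illustrative): for a = Set(d, 5) and b = Set(d, 3), the lower
// 64-bit lanes encode x^2+1 and x+1; their carry-less product is
// x^3+x^2+x+1 = 15, so each 128-bit result block is 15 (low u64 half 15,
// high u64 half 0).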
  3280. template <class V>
  3281. HWY_API V CLMulUpper(V a, V b) {
  3282. const DFromV<V> d;
  3283. static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
  3284. const auto k1 = Set(d, 0x1111111111111111ULL);
  3285. const auto k2 = Set(d, 0x2222222222222222ULL);
  3286. const auto k4 = Set(d, 0x4444444444444444ULL);
  3287. const auto k8 = Set(d, 0x8888888888888888ULL);
  3288. const auto a0 = And(a, k1);
  3289. const auto a1 = And(a, k2);
  3290. const auto a2 = And(a, k4);
  3291. const auto a3 = And(a, k8);
  3292. const auto b0 = And(b, k1);
  3293. const auto b1 = And(b, k2);
  3294. const auto b2 = And(b, k4);
  3295. const auto b3 = And(b, k8);
  3296. auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
  3297. auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
  3298. auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
  3299. auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
  3300. m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
  3301. m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
  3302. m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
  3303. m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
  3304. return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
  3305. }
3306. #endif // HWY_TARGET != HWY_SCALAR
3307. #endif // HWY_NATIVE_AES
  3308. // ------------------------------ PopulationCount
  3309. #if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
  3310. #ifdef HWY_NATIVE_POPCNT
  3311. #undef HWY_NATIVE_POPCNT
  3312. #else
  3313. #define HWY_NATIVE_POPCNT
  3314. #endif
  3315. // This overload requires vectors to be at least 16 bytes, which is the case
  3316. // for LMUL >= 2.
  3317. #undef HWY_IF_POPCNT
  3318. #if HWY_TARGET == HWY_RVV
  3319. #define HWY_IF_POPCNT(D) \
  3320. hwy::EnableIf<D().Pow2() >= 1 && D().MaxLanes() >= 16>* = nullptr
  3321. #else
  3322. // Other targets only have these two overloads which are mutually exclusive, so
  3323. // no further conditions are required.
  3324. #define HWY_IF_POPCNT(D) void* = nullptr
  3325. #endif // HWY_TARGET == HWY_RVV
  3326. template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
  3327. HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)>
  3328. HWY_API V PopulationCount(V v) {
  3329. const D d;
  3330. const V lookup =
  3331. Dup128VecFromValues(d, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
  3332. const auto lo = And(v, Set(d, uint8_t{0xF}));
  3333. const auto hi = ShiftRight<4>(v);
  3334. return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
  3335. }
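// Example (illustrative): v[i] = 0xB3 has low nibble 0x3 (lookup value 2) and
// high nibble 0xB (lookup value 3), so the returned count is 5.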
  3336. // RVV has a specialization that avoids the Set().
  3337. #if HWY_TARGET != HWY_RVV
  3338. // Slower fallback for capped vectors.
  3339. template <class V, class D = DFromV<V>, HWY_IF_U8_D(D),
  3340. HWY_IF_V_SIZE_LE_D(D, 8)>
  3341. HWY_API V PopulationCount(V v) {
  3342. const D d;
  3343. // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
  3344. const V k33 = Set(d, uint8_t{0x33});
  3345. v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
  3346. v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
  3347. return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
  3348. }
  3349. #endif // HWY_TARGET != HWY_RVV
  3350. template <class V, class D = DFromV<V>, HWY_IF_U16_D(D)>
  3351. HWY_API V PopulationCount(V v) {
  3352. const D d;
  3353. const Repartition<uint8_t, decltype(d)> d8;
  3354. const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
  3355. return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
  3356. }
  3357. template <class V, class D = DFromV<V>, HWY_IF_U32_D(D)>
  3358. HWY_API V PopulationCount(V v) {
  3359. const D d;
  3360. Repartition<uint16_t, decltype(d)> d16;
  3361. auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
  3362. return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
  3363. }
  3364. #if HWY_HAVE_INTEGER64
  3365. template <class V, class D = DFromV<V>, HWY_IF_U64_D(D)>
  3366. HWY_API V PopulationCount(V v) {
  3367. const D d;
  3368. Repartition<uint32_t, decltype(d)> d32;
  3369. auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
  3370. return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
  3371. }
  3372. #endif
  3373. #endif // HWY_NATIVE_POPCNT
  3374. // ------------------------------ 8-bit multiplication
  3375. #if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
  3376. #ifdef HWY_NATIVE_MUL_8
  3377. #undef HWY_NATIVE_MUL_8
  3378. #else
  3379. #define HWY_NATIVE_MUL_8
  3380. #endif
  3381. // 8 bit and fits in wider reg: promote
  3382. template <class V, HWY_IF_T_SIZE_V(V, 1),
  3383. HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
  3384. HWY_API V operator*(const V a, const V b) {
  3385. const DFromV<decltype(a)> d;
  3386. const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
  3387. const RebindToUnsigned<decltype(d)> du; // TruncateTo result
  3388. const RebindToUnsigned<decltype(dw)> dwu; // TruncateTo input
  3389. const VFromD<decltype(dw)> mul = PromoteTo(dw, a) * PromoteTo(dw, b);
  3390. // TruncateTo is cheaper than ConcatEven.
  3391. return BitCast(d, TruncateTo(du, BitCast(dwu, mul)));
  3392. }
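// As in scalar C++, the product wraps modulo 256; e.g. uint8_t lanes with
// 200 * 3 = 600 truncate to 88 (0x58).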
  3393. // 8 bit full reg: promote halves
  3394. template <class V, HWY_IF_T_SIZE_V(V, 1),
  3395. HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
  3396. HWY_API V operator*(const V a, const V b) {
  3397. const DFromV<decltype(a)> d;
  3398. const Half<decltype(d)> dh;
  3399. const Twice<RepartitionToWide<decltype(dh)>> dw;
  3400. const VFromD<decltype(dw)> a0 = PromoteTo(dw, LowerHalf(dh, a));
  3401. const VFromD<decltype(dw)> a1 = PromoteTo(dw, UpperHalf(dh, a));
  3402. const VFromD<decltype(dw)> b0 = PromoteTo(dw, LowerHalf(dh, b));
  3403. const VFromD<decltype(dw)> b1 = PromoteTo(dw, UpperHalf(dh, b));
  3404. const VFromD<decltype(dw)> m0 = a0 * b0;
  3405. const VFromD<decltype(dw)> m1 = a1 * b1;
  3406. return ConcatEven(d, BitCast(d, m1), BitCast(d, m0));
  3407. }
  3408. #endif // HWY_NATIVE_MUL_8
  3409. // ------------------------------ 64-bit multiplication
  3410. #if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
  3411. #ifdef HWY_NATIVE_MUL_64
  3412. #undef HWY_NATIVE_MUL_64
  3413. #else
  3414. #define HWY_NATIVE_MUL_64
  3415. #endif
  3416. // Single-lane i64 or u64
  3417. template <class V, HWY_IF_T_SIZE_V(V, 8), HWY_IF_V_SIZE_V(V, 8),
  3418. HWY_IF_NOT_FLOAT_V(V)>
  3419. HWY_API V operator*(V x, V y) {
  3420. const DFromV<V> d;
  3421. using T = TFromD<decltype(d)>;
  3422. using TU = MakeUnsigned<T>;
  3423. const TU xu = static_cast<TU>(GetLane(x));
  3424. const TU yu = static_cast<TU>(GetLane(y));
  3425. return Set(d, static_cast<T>(xu * yu));
  3426. }
  3427. template <class V, class D64 = DFromV<V>, HWY_IF_U64_D(D64),
  3428. HWY_IF_V_SIZE_GT_D(D64, 8)>
  3429. HWY_API V operator*(V x, V y) {
  3430. RepartitionToNarrow<D64> d32;
  3431. auto x32 = BitCast(d32, x);
  3432. auto y32 = BitCast(d32, y);
  3433. auto lolo = BitCast(d32, MulEven(x32, y32));
  3434. auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
  3435. auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
  3436. auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
  3437. return BitCast(D64{}, lolo + hi);
  3438. }
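// This is the schoolbook decomposition: with x = xh*2^32 + xl and
// y = yh*2^32 + yl, x*y mod 2^64 = xl*yl + ((xl*yh + xh*yl) << 32), and the
// xh*yh term vanishes modulo 2^64.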
  3439. template <class V, class DI64 = DFromV<V>, HWY_IF_I64_D(DI64),
  3440. HWY_IF_V_SIZE_GT_D(DI64, 8)>
  3441. HWY_API V operator*(V x, V y) {
  3442. RebindToUnsigned<DI64> du64;
  3443. return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
  3444. }
  3445. #endif // HWY_NATIVE_MUL_64
  3446. // ------------------------------ MulAdd / NegMulAdd
  3447. #if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE))
  3448. #ifdef HWY_NATIVE_INT_FMA
  3449. #undef HWY_NATIVE_INT_FMA
  3450. #else
  3451. #define HWY_NATIVE_INT_FMA
  3452. #endif
  3453. #ifdef HWY_NATIVE_INT_FMSUB
  3454. #undef HWY_NATIVE_INT_FMSUB
  3455. #else
  3456. #define HWY_NATIVE_INT_FMSUB
  3457. #endif
  3458. template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
  3459. HWY_API V MulAdd(V mul, V x, V add) {
  3460. return Add(Mul(mul, x), add);
  3461. }
  3462. template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
  3463. HWY_API V NegMulAdd(V mul, V x, V add) {
  3464. return Sub(add, Mul(mul, x));
  3465. }
  3466. template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
  3467. HWY_API V MulSub(V mul, V x, V sub) {
  3468. return Sub(Mul(mul, x), sub);
  3469. }
  3470. #endif // HWY_NATIVE_INT_FMA
  3471. // ------------------------------ Integer MulSub / NegMulSub
  3472. #if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE))
  3473. #ifdef HWY_NATIVE_INT_FMSUB
  3474. #undef HWY_NATIVE_INT_FMSUB
  3475. #else
  3476. #define HWY_NATIVE_INT_FMSUB
  3477. #endif
  3478. template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
  3479. HWY_API V MulSub(V mul, V x, V sub) {
  3480. const DFromV<decltype(mul)> d;
  3481. const RebindToSigned<decltype(d)> di;
  3482. return MulAdd(mul, x, BitCast(d, Neg(BitCast(di, sub))));
  3483. }
  3484. #endif // HWY_NATIVE_INT_FMSUB
  3485. template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
  3486. HWY_API V NegMulSub(V mul, V x, V sub) {
  3487. const DFromV<decltype(mul)> d;
  3488. const RebindToSigned<decltype(d)> di;
  3489. return BitCast(d, Neg(BitCast(di, MulAdd(mul, x, sub))));
  3490. }
  3491. // ------------------------------ MulAddSub
  3492. // MulAddSub(mul, x, sub_or_add) for a 1-lane vector is equivalent to
  3493. // MulSub(mul, x, sub_or_add)
  3494. template <class V, HWY_IF_LANES_D(DFromV<V>, 1)>
  3495. HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
  3496. return MulSub(mul, x, sub_or_add);
  3497. }
  3498. // MulAddSub for F16/F32/F64 vectors with 2 or more lanes on
  3499. // SSSE3/SSE4/AVX2/AVX3 is implemented in x86_128-inl.h, x86_256-inl.h, and
  3500. // x86_512-inl.h
  3501. // MulAddSub for F16/F32/F64 vectors on SVE is implemented in arm_sve-inl.h
  3502. // MulAddSub for integer vectors on SVE2 is implemented in arm_sve-inl.h
  3503. template <class V, HWY_IF_MULADDSUB_V(V)>
  3504. HWY_API V MulAddSub(V mul, V x, V sub_or_add) {
  3505. using D = DFromV<V>;
  3506. using T = TFromD<D>;
  3507. using TNegate = If<!IsSigned<T>(), MakeSigned<T>, T>;
  3508. const D d;
  3509. const Rebind<TNegate, D> d_negate;
  3510. const auto add =
  3511. OddEven(sub_or_add, BitCast(d, Neg(BitCast(d_negate, sub_or_add))));
  3512. return MulAdd(mul, x, add);
  3513. }
  3514. // ------------------------------ Integer division
  3515. #if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE))
  3516. #ifdef HWY_NATIVE_INT_DIV
  3517. #undef HWY_NATIVE_INT_DIV
  3518. #else
  3519. #define HWY_NATIVE_INT_DIV
  3520. #endif
  3521. namespace detail {
  3522. // DemoteInRangeTo, PromoteInRangeTo, and ConvertInRangeTo are okay to use in
  3523. // the implementation of detail::IntDiv in generic_ops-inl.h as the current
  3524. // implementations of DemoteInRangeTo, PromoteInRangeTo, and ConvertInRangeTo
  3525. // will convert values that are outside of the range of TFromD<DI> by either
  3526. // saturation, truncation, or converting values that are outside of the
  3527. // destination range to LimitsMin<TFromD<DI>>() (which is equal to
  3528. // static_cast<TFromD<DI>>(LimitsMax<TFromD<DI>>() + 1))
  3529. template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
  3530. HWY_INLINE Vec<D> IntDivConvFloatToInt(D di, V vf) {
  3531. return ConvertInRangeTo(di, vf);
  3532. }
  3533. template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>))>
  3534. HWY_INLINE Vec<D> IntDivConvIntToFloat(D df, V vi) {
  3535. return ConvertTo(df, vi);
  3536. }
  3537. #if !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
  3538. template <class D, class V, HWY_IF_UI64_D(D), HWY_IF_F32(TFromV<V>)>
3539. HWY_INLINE Vec<D> IntDivConvFloatToInt(D di, V vf) {
3540. return PromoteInRangeTo(di, vf);
  3541. }
  3542. // If !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 is true, then UI64->F32
  3543. // IntDivConvIntToFloat(df, vi) returns an approximation of
  3544. // static_cast<float>(v[i]) that is within 4 ULP of static_cast<float>(v[i])
  3545. template <class D, class V, HWY_IF_F32_D(D), HWY_IF_I64(TFromV<V>)>
  3546. HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vi) {
  3547. const Twice<decltype(df32)> dt_f32;
  3548. auto vf32 =
  3549. ConvertTo(dt_f32, BitCast(RebindToSigned<decltype(dt_f32)>(), vi));
  3550. #if HWY_IS_LITTLE_ENDIAN
  3551. const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
  3552. auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
  3553. #else
  3554. const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
  3555. auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
  3556. #endif
  3557. const RebindToSigned<decltype(df32)> di32;
  3558. hi_f32 =
  3559. Add(hi_f32, And(BitCast(df32, BroadcastSignBit(BitCast(di32, lo_f32))),
  3560. Set(df32, 1.0f)));
  3561. return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
  3562. }
  3563. template <class D, class V, HWY_IF_F32_D(D), HWY_IF_U64(TFromV<V>)>
  3564. HWY_INLINE Vec<D> IntDivConvIntToFloat(D df32, V vu) {
  3565. const Twice<decltype(df32)> dt_f32;
  3566. auto vf32 =
  3567. ConvertTo(dt_f32, BitCast(RebindToUnsigned<decltype(dt_f32)>(), vu));
  3568. #if HWY_IS_LITTLE_ENDIAN
  3569. const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
  3570. const auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
  3571. #else
  3572. const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32));
  3573. const auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32));
  3574. #endif
  3575. return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32);
  3576. }
  3577. #endif // !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
  3578. template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
  3579. HWY_IF_T_SIZE_GT(TFromV<V>, kOrigLaneSize)>
  3580. HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
  3581. const DFromV<decltype(a)> d;
  3582. const RebindToFloat<decltype(d)> df;
  3583. // If kOrigLaneSize < sizeof(T) is true, then a[i] and b[i] are both in the
  3584. // [LimitsMin<SignedFromSize<kOrigLaneSize>>(),
  3585. // LimitsMax<UnsignedFromSize<kOrigLaneSize>>()] range.
  3586. // floor(|a[i] / b[i]|) <= |flt_q| < floor(|a[i] / b[i]|) + 1 is also
  3587. // guaranteed to be true if MakeFloat<T> has at least kOrigLaneSize*8 + 1
  3588. // mantissa bits (including the implied one bit), where flt_q is equal to
  3589. // static_cast<MakeFloat<T>>(a[i]) / static_cast<MakeFloat<T>>(b[i]),
  3590. // even in the case where the magnitude of an inexact floating point division
  3591. // result is rounded up.
  3592. // In other words, floor(flt_q) < flt_q < ceil(flt_q) is guaranteed to be true
  3593. // if (a[i] % b[i]) != 0 is true and MakeFloat<T> has at least
  3594. // kOrigLaneSize*8 + 1 mantissa bits (including the implied one bit), even in
  3595. // the case where the magnitude of an inexact floating point division result
  3596. // is rounded up.
  3597. // It is okay to do conversions from MakeFloat<TFromV<V>> to TFromV<V> using
  3598. // ConvertInRangeTo if sizeof(TFromV<V>) > kOrigLaneSize as the result of the
  3599. // floating point division is always greater than LimitsMin<TFromV<V>>() and
  3600. // less than LimitsMax<TFromV<V>>() if sizeof(TFromV<V>) > kOrigLaneSize and
  3601. // b[i] != 0.
  3602. #if HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64
  3603. // On Armv7, do division by multiplying by the ApproximateReciprocal
  3604. // to avoid unnecessary overhead as F32 Div refines the approximate
  3605. // reciprocal using 4 Newton-Raphson iterations
  3606. const RebindToSigned<decltype(d)> di;
  3607. const RebindToUnsigned<decltype(d)> du;
  3608. const auto flt_b = ConvertTo(df, b);
  3609. auto flt_recip_b = ApproximateReciprocal(flt_b);
  3610. if (kOrigLaneSize > 1) {
  3611. flt_recip_b =
  3612. Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
  3613. }
  3614. auto q0 = ConvertInRangeTo(d, Mul(ConvertTo(df, a), flt_recip_b));
  3615. const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
  3616. auto r1 = r0;
  3617. // Need to negate r1[i] if a[i] < 0 is true
  3618. if (IsSigned<TFromV<V>>()) {
  3619. r1 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r1);
  3620. }
  3621. // r1[i] is now equal to (a[i] < 0) ? (-r0[i]) : r0[i]
  3622. auto abs_b = BitCast(du, b);
  3623. if (IsSigned<TFromV<V>>()) {
  3624. abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
  3625. }
  3626. // If (r1[i] < 0 || r1[i] >= abs_b[i]) is true, then set q1[i] to -1.
  3627. // Otherwise, set q1[i] to 0.
  3628. // (r1[i] < 0 || r1[i] >= abs_b[i]) can be carried out using a single unsigned
  3629. // comparison as static_cast<TU>(r1[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
  3630. // will be true if r1[i] < 0 is true.
  3631. auto q1 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r1), abs_b)));
  3632. // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ? -1 : 0
  3633. // Need to negate q1[i] if r0[i] and b[i] do not have the same sign
  3634. auto q1_negate_mask = r0;
  3635. if (IsSigned<TFromV<V>>()) {
  3636. q1_negate_mask = Xor(q1_negate_mask, BitCast(di, b));
  3637. }
  3638. q1 = IfNegativeThenElse(q1_negate_mask, Neg(q1), q1);
  3639. // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ?
3640. // (((r0[i] ^ b[i]) < 0) ? 1 : -1) : 0
  3641. // Need to subtract q1[i] from q0[i] to get the final result
  3642. return Sub(q0, BitCast(d, q1));
  3643. #else
  3644. // On targets other than Armv7 NEON, use F16 or F32 division as most targets
  3645. // other than Armv7 NEON have native F32 divide instructions
  3646. return ConvertInRangeTo(d, Div(ConvertTo(df, a), ConvertTo(df, b)));
  3647. #endif
  3648. }
  3649. template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
  3650. HWY_IF_T_SIZE(TFromV<V>, kOrigLaneSize),
  3651. HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))>
  3652. HWY_INLINE V IntDivUsingFloatDiv(V a, V b) {
  3653. // If kOrigLaneSize == sizeof(T) is true, at least two reciprocal
  3654. // multiplication steps are needed as the mantissa of MakeFloat<T> has fewer
  3655. // than kOrigLaneSize*8 + 1 bits
  3656. using T = TFromV<V>;
  3657. #if HWY_HAVE_FLOAT64
  3658. using TF = MakeFloat<T>;
  3659. #else
  3660. using TF = float;
  3661. #endif
  3662. const DFromV<decltype(a)> d;
  3663. const RebindToSigned<decltype(d)> di;
  3664. const RebindToUnsigned<decltype(d)> du;
  3665. const Rebind<TF, decltype(d)> df;
  3666. if (!IsSigned<T>()) {
  3667. // If T is unsigned, set a[i] to (a[i] >= b[i] ? 1 : 0) and set b[i] to 1 if
  3668. // b[i] > LimitsMax<MakeSigned<T>>() is true
  3669. const auto one = Set(di, MakeSigned<T>{1});
  3670. a = BitCast(
  3671. d, IfNegativeThenElse(BitCast(di, b),
  3672. IfThenElseZero(RebindMask(di, Ge(a, b)), one),
  3673. BitCast(di, a)));
  3674. b = BitCast(d, IfNegativeThenElse(BitCast(di, b), one, BitCast(di, b)));
  3675. }
  3676. // LimitsMin<T>() <= b[i] <= LimitsMax<MakeSigned<T>>() is now true
  3677. const auto flt_b = IntDivConvIntToFloat(df, b);
  3678. #if HWY_TARGET_IS_NEON && !HWY_HAVE_FLOAT64
  3679. auto flt_recip_b = ApproximateReciprocal(flt_b);
  3680. flt_recip_b =
  3681. Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b));
  3682. #else
  3683. const auto flt_recip_b = Div(Set(df, TF(1.0)), flt_b);
  3684. #endif
  3685. // It is okay if the conversion of a[i] * flt_recip_b[i] to T using
  3686. // IntDivConvFloatToInt returns incorrect results in any lanes where b[i] == 0
  3687. // as the result of IntDivUsingFloatDiv(a, b) is implementation-defined in any
  3688. // lanes where b[i] == 0.
  3689. // If ScalarAbs(b[i]) == 1 is true, then it is possible for
  3690. // a[i] * flt_recip_b[i] to be rounded up to a value that is outside of the
  3691. // range of T. If a[i] * flt_recip_b[i] is outside of the range of T,
  3692. // IntDivConvFloatToInt will convert any values that are out of the range of T
  3693. // by either saturation, truncation, or wrapping around to LimitsMin<T>().
  3694. // It is okay if the conversion of a[i] * flt_recip_b[i] to T using
  3695. // IntDivConvFloatToInt wraps around if ScalarAbs(b[i]) == 1 as r0 will have
  3696. // the correct sign if ScalarAbs(b[i]) == 1, even in the cases where the
  3697. // conversion of a[i] * flt_recip_b[i] to T using IntDivConvFloatToInt is
  3698. // truncated or wraps around.
  3699. // If ScalarAbs(b[i]) >= 2 is true, a[i] * flt_recip_b[i] will be within the
  3700. // range of T, even in the cases where the conversion of a[i] to TF is
  3701. // rounded up or the result of multiplying a[i] by flt_recip_b[i] is rounded
  3702. // up.
  3703. // ScalarAbs(r0[i]) will also always be less than (LimitsMax<T>() / 2) if
  3704. // b[i] != 0, even in the cases where the conversion of a[i] * flt_recip_b[i]
  3705. // to T using IntDivConvFloatToInt is truncated or is wrapped around.
  3706. auto q0 =
  3707. IntDivConvFloatToInt(d, Mul(IntDivConvIntToFloat(df, a), flt_recip_b));
  3708. const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a));
  3709. // If b[i] != 0 is true, r0[i] * flt_recip_b[i] is always within the range of
  3710. // T, even in the cases where the conversion of r0[i] to TF is rounded up or
  3711. // the multiplication of r0[i] by flt_recip_b[i] is rounded up.
  3712. auto q1 =
  3713. IntDivConvFloatToInt(di, Mul(IntDivConvIntToFloat(df, r0), flt_recip_b));
  3714. const auto r1 = hwy::HWY_NAMESPACE::NegMulAdd(q1, BitCast(di, b), r0);
  3715. auto r3 = r1;
  3716. #if !HWY_HAVE_FLOAT64
  3717. // Need two additional reciprocal multiplication steps for I64/U64 vectors if
  3718. // HWY_HAVE_FLOAT64 is 0
  3719. if (sizeof(T) == 8) {
  3720. const auto q2 = IntDivConvFloatToInt(
  3721. di, Mul(IntDivConvIntToFloat(df, r1), flt_recip_b));
  3722. const auto r2 = hwy::HWY_NAMESPACE::NegMulAdd(q2, BitCast(di, b), r1);
  3723. const auto q3 = IntDivConvFloatToInt(
  3724. di, Mul(IntDivConvIntToFloat(df, r2), flt_recip_b));
  3725. r3 = hwy::HWY_NAMESPACE::NegMulAdd(q3, BitCast(di, b), r2);
  3726. q0 = Add(q0, BitCast(d, q2));
  3727. q1 = Add(q1, q3);
  3728. }
  3729. #endif // !HWY_HAVE_FLOAT64
  3730. auto r4 = r3;
  3731. // Need to negate r4[i] if a[i] < 0 is true
  3732. if (IsSigned<TFromV<V>>()) {
  3733. r4 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r4);
  3734. }
  3735. // r4[i] is now equal to (a[i] < 0) ? (-r3[i]) : r3[i]
  3736. auto abs_b = BitCast(du, b);
  3737. if (IsSigned<TFromV<V>>()) {
  3738. abs_b = BitCast(du, Abs(BitCast(di, abs_b)));
  3739. }
  3740. // If (r4[i] < 0 || r4[i] >= abs_b[i]) is true, then set q4[i] to -1.
3741. // Otherwise, set q4[i] to 0.
  3742. // (r4[i] < 0 || r4[i] >= abs_b[i]) can be carried out using a single unsigned
  3743. // comparison as static_cast<TU>(r4[i]) >= TU(LimitsMax<TI>() + 1) >= abs_b[i]
  3744. // will be true if r4[i] < 0 is true.
  3745. auto q4 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r4), abs_b)));
  3746. // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ? -1 : 0
  3747. // Need to negate q4[i] if r3[i] and b[i] do not have the same sign
  3748. auto q4_negate_mask = r3;
  3749. if (IsSigned<TFromV<V>>()) {
  3750. q4_negate_mask = Xor(q4_negate_mask, BitCast(di, b));
  3751. }
  3752. q4 = IfNegativeThenElse(q4_negate_mask, Neg(q4), q4);
  3753. // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ?
3754. // (((r3[i] ^ b[i]) < 0) ? 1 : -1) : 0
  3755. // The final result is equal to q0[i] + q1[i] - q4[i]
  3756. return Sub(Add(q0, BitCast(d, q1)), BitCast(d, q4));
  3757. }
  3758. template <size_t kOrigLaneSize, class V,
  3759. HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2)),
  3760. HWY_IF_V_SIZE_LE_V(
  3761. V, HWY_MAX_BYTES /
  3762. ((!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1) ? 4 : 2))>
  3763. HWY_INLINE V IntDiv(V a, V b) {
  3764. using T = TFromV<V>;
  3765. // If HWY_HAVE_FLOAT16 is 0, need to promote I8 to I32 and U8 to U32
  3766. using TW = MakeWide<
  3767. If<(!HWY_HAVE_FLOAT16 && sizeof(TFromV<V>) == 1), MakeWide<T>, T>>;
  3768. const DFromV<decltype(a)> d;
  3769. const Rebind<TW, decltype(d)> dw;
  3770. #if HWY_TARGET <= HWY_SSE2
  3771. // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid
  3772. // unnecessary overhead
  3773. const RebindToSigned<decltype(dw)> dw_i;
  3774. // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<T> if
  3775. // kOrigLaneSize < sizeof(T) to avoid unnecessary overhead
  3776. const If<(kOrigLaneSize < sizeof(T)), RebindToSigned<decltype(d)>,
  3777. decltype(d)>
  3778. d_demote_to;
  3779. #else
  3780. // On other targets, promote to TW and demote to T
  3781. const decltype(dw) dw_i;
  3782. const decltype(d) d_demote_to;
  3783. #endif
  3784. return BitCast(
  3785. d, DemoteTo(d_demote_to, IntDivUsingFloatDiv<kOrigLaneSize>(
  3786. PromoteTo(dw_i, a), PromoteTo(dw_i, b))));
  3787. }
  3788. template <size_t kOrigLaneSize, class V,
  3789. HWY_IF_T_SIZE_ONE_OF_V(V,
  3790. (HWY_HAVE_FLOAT16 ? (1 << 1) : 0) | (1 << 2)),
  3791. HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
  3792. HWY_INLINE V IntDiv(V a, V b) {
  3793. const DFromV<decltype(a)> d;
  3794. const RepartitionToWide<decltype(d)> dw;
  3795. #if HWY_TARGET <= HWY_SSE2
  3796. // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned<TW> to avoid
  3797. // unnecessary overhead
  3798. const RebindToSigned<decltype(dw)> dw_i;
  3799. // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned<TFromV<V>> if
  3800. // kOrigLaneSize < sizeof(TFromV<V>) to avoid unnecessary overhead
  3801. const If<(kOrigLaneSize < sizeof(TFromV<V>)), RebindToSigned<decltype(d)>,
  3802. decltype(d)>
  3803. d_demote_to;
  3804. #else
  3805. // On other targets, promote to MakeWide<TFromV<V>> and demote to TFromV<V>
  3806. const decltype(dw) dw_i;
  3807. const decltype(d) d_demote_to;
  3808. #endif
  3809. return BitCast(d, OrderedDemote2To(
  3810. d_demote_to,
  3811. IntDivUsingFloatDiv<kOrigLaneSize>(
  3812. PromoteLowerTo(dw_i, a), PromoteLowerTo(dw_i, b)),
  3813. IntDivUsingFloatDiv<kOrigLaneSize>(
  3814. PromoteUpperTo(dw_i, a), PromoteUpperTo(dw_i, b))));
  3815. }
  3816. #if !HWY_HAVE_FLOAT16
  3817. template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
  3818. HWY_IF_V_SIZE_V(V, HWY_MAX_BYTES / 2)>
  3819. HWY_INLINE V IntDiv(V a, V b) {
  3820. const DFromV<decltype(a)> d;
  3821. const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
  3822. #if HWY_TARGET <= HWY_SSE2
  3823. // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary
  3824. // overhead
  3825. const RebindToSigned<decltype(dw)> dw_i;
  3826. #else
  3827. // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V>
  3828. const decltype(dw) dw_i;
  3829. #endif
  3830. return DemoteTo(d,
  3831. BitCast(dw_i, IntDiv<1>(PromoteTo(dw, a), PromoteTo(dw, b))));
  3832. }
  3833. template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
  3834. HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
  3835. HWY_INLINE V IntDiv(V a, V b) {
  3836. const DFromV<decltype(a)> d;
  3837. const RepartitionToWide<decltype(d)> dw;
  3838. #if HWY_TARGET <= HWY_SSE2
  3839. // On SSE2/SSSE3, demote from int16_t to TFromV<V> to avoid unnecessary
  3840. // overhead
  3841. const RebindToSigned<decltype(dw)> dw_i;
  3842. #else
  3843. // On other targets, demote from MakeWide<TFromV<V>> to TFromV<V>
  3844. const decltype(dw) dw_i;
  3845. #endif
  3846. return OrderedDemote2To(
  3847. d, BitCast(dw_i, IntDiv<1>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b))),
  3848. BitCast(dw_i, IntDiv<1>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b))));
  3849. }
  3850. #endif // !HWY_HAVE_FLOAT16
  3851. template <size_t kOrigLaneSize, class V,
  3852. HWY_IF_T_SIZE_ONE_OF_V(V,
  3853. (HWY_HAVE_FLOAT64 ? 0 : (1 << 4)) | (1 << 8))>
  3854. HWY_INLINE V IntDiv(V a, V b) {
  3855. return IntDivUsingFloatDiv<kOrigLaneSize>(a, b);
  3856. }
  3857. #if HWY_HAVE_FLOAT64
  3858. template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
  3859. HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
  3860. HWY_INLINE V IntDiv(V a, V b) {
  3861. const DFromV<decltype(a)> d;
  3862. const Rebind<double, decltype(d)> df64;
  3863. // It is okay to demote the F64 Div result to int32_t or uint32_t using
  3864. // DemoteInRangeTo as static_cast<double>(a[i]) / static_cast<double>(b[i])
  3865. // will always be within the range of TFromV<V> if b[i] != 0 and
  3866. // sizeof(TFromV<V>) <= 4.
  3867. return DemoteInRangeTo(d, Div(PromoteTo(df64, a), PromoteTo(df64, b)));
  3868. }
  3869. template <size_t kOrigLaneSize, class V, HWY_IF_UI32(TFromV<V>),
  3870. HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
  3871. HWY_INLINE V IntDiv(V a, V b) {
  3872. const DFromV<decltype(a)> d;
  3873. const Half<decltype(d)> dh;
  3874. const Repartition<double, decltype(d)> df64;
  3875. // It is okay to demote the F64 Div result to int32_t or uint32_t using
  3876. // DemoteInRangeTo as static_cast<double>(a[i]) / static_cast<double>(b[i])
  3877. // will always be within the range of TFromV<V> if b[i] != 0 and
  3878. // sizeof(TFromV<V>) <= 4.
  3879. const VFromD<decltype(df64)> div1 =
  3880. Div(PromoteUpperTo(df64, a), PromoteUpperTo(df64, b));
  3881. const VFromD<decltype(df64)> div0 =
  3882. Div(PromoteLowerTo(df64, a), PromoteLowerTo(df64, b));
  3883. return Combine(d, DemoteInRangeTo(dh, div1), DemoteInRangeTo(dh, div0));
  3884. }
  3885. #endif // HWY_HAVE_FLOAT64
  3886. template <size_t kOrigLaneSize, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
  3887. HWY_IF_T_SIZE_ONE_OF_V(V, ((HWY_TARGET <= HWY_SSE2 ||
  3888. HWY_TARGET == HWY_WASM ||
  3889. HWY_TARGET == HWY_WASM_EMU256)
  3890. ? 0
  3891. : (1 << 1)) |
  3892. (1 << 2) | (1 << 4) | (1 << 8))>
  3893. HWY_INLINE V IntMod(V a, V b) {
  3894. return hwy::HWY_NAMESPACE::NegMulAdd(IntDiv<kOrigLaneSize>(a, b), b, a);
  3895. }
  3896. #if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
  3897. HWY_TARGET == HWY_WASM_EMU256
  3898. template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
  3899. HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)>
  3900. HWY_INLINE V IntMod(V a, V b) {
  3901. const DFromV<decltype(a)> d;
  3902. const Rebind<MakeWide<TFromV<V>>, decltype(d)> dw;
  3903. return DemoteTo(d, IntMod<kOrigLaneSize>(PromoteTo(dw, a), PromoteTo(dw, b)));
  3904. }
  3905. template <size_t kOrigLaneSize, class V, HWY_IF_UI8(TFromV<V>),
  3906. HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)>
  3907. HWY_INLINE V IntMod(V a, V b) {
  3908. const DFromV<decltype(a)> d;
  3909. const RepartitionToWide<decltype(d)> dw;
  3910. return OrderedDemote2To(
  3911. d, IntMod<kOrigLaneSize>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b)),
  3912. IntMod<kOrigLaneSize>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b)));
  3913. }
  3914. #endif // HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || HWY_TARGET ==
  3915. // HWY_WASM_EMU256
  3916. } // namespace detail
  3917. #if HWY_TARGET == HWY_SCALAR
  3918. template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  3919. HWY_API Vec1<T> operator/(Vec1<T> a, Vec1<T> b) {
  3920. return detail::IntDiv<sizeof(T)>(a, b);
  3921. }
  3922. template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  3923. HWY_API Vec1<T> operator%(Vec1<T> a, Vec1<T> b) {
  3924. return detail::IntMod<sizeof(T)>(a, b);
  3925. }
  3926. #else // HWY_TARGET != HWY_SCALAR
  3927. template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  3928. HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
  3929. return detail::IntDiv<sizeof(T)>(a, b);
  3930. }
  3931. template <class T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  3932. HWY_API Vec128<T, N> operator%(Vec128<T, N> a, Vec128<T, N> b) {
  3933. return detail::IntMod<sizeof(T)>(a, b);
  3934. }
  3935. #if HWY_CAP_GE256
  3936. template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  3937. HWY_API Vec256<T> operator/(Vec256<T> a, Vec256<T> b) {
  3938. return detail::IntDiv<sizeof(T)>(a, b);
  3939. }
  3940. template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  3941. HWY_API Vec256<T> operator%(Vec256<T> a, Vec256<T> b) {
  3942. return detail::IntMod<sizeof(T)>(a, b);
  3943. }
  3944. #endif
  3945. #if HWY_CAP_GE512
  3946. template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  3947. HWY_API Vec512<T> operator/(Vec512<T> a, Vec512<T> b) {
  3948. return detail::IntDiv<sizeof(T)>(a, b);
  3949. }
  3950. template <class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
  3951. HWY_API Vec512<T> operator%(Vec512<T> a, Vec512<T> b) {
  3952. return detail::IntMod<sizeof(T)>(a, b);
  3953. }
  3954. #endif
  3955. #endif // HWY_TARGET == HWY_SCALAR
  3956. #endif // HWY_NATIVE_INT_DIV
  3957. // ------------------------------ MulEvenAdd (PromoteEvenTo)
  3958. // SVE with bf16 and NEON with bf16 override this.
  3959. #if (defined(HWY_NATIVE_MUL_EVEN_BF16) == defined(HWY_TARGET_TOGGLE))
  3960. #ifdef HWY_NATIVE_MUL_EVEN_BF16
  3961. #undef HWY_NATIVE_MUL_EVEN_BF16
  3962. #else
  3963. #define HWY_NATIVE_MUL_EVEN_BF16
  3964. #endif
  3965. template <class DF, HWY_IF_F32_D(DF),
  3966. class VBF = VFromD<Repartition<bfloat16_t, DF>>>
  3967. HWY_API VFromD<DF> MulEvenAdd(DF df, VBF a, VBF b, VFromD<DF> c) {
  3968. return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), c);
  3969. }
  3970. template <class DF, HWY_IF_F32_D(DF),
  3971. class VBF = VFromD<Repartition<bfloat16_t, DF>>>
  3972. HWY_API VFromD<DF> MulOddAdd(DF df, VBF a, VBF b, VFromD<DF> c) {
  3973. return MulAdd(PromoteOddTo(df, a), PromoteOddTo(df, b), c);
  3974. }
  3975. #endif // HWY_NATIVE_MUL_EVEN_BF16
  3976. // ------------------------------ ReorderWidenMulAccumulate (MulEvenAdd)
  3977. // AVX3_SPR/ZEN4, and NEON with bf16 but not(!) SVE override this.
  3978. #if (defined(HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16) == \
  3979. defined(HWY_TARGET_TOGGLE))
  3980. #ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
  3981. #undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
  3982. #else
  3983. #define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
  3984. #endif
  3985. template <class DF, HWY_IF_F32_D(DF),
  3986. class VBF = VFromD<Repartition<bfloat16_t, DF>>>
  3987. HWY_API VFromD<DF> ReorderWidenMulAccumulate(DF df, VBF a, VBF b,
  3988. VFromD<DF> sum0,
  3989. VFromD<DF>& sum1) {
  3990. // Lane order within sum0/1 is undefined, hence we can avoid the
  3991. // longer-latency lane-crossing PromoteTo by using PromoteEvenTo.
  3992. sum1 = MulOddAdd(df, a, b, sum1);
  3993. return MulEvenAdd(df, a, b, sum0);
  3994. }
  3995. #endif // HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
  3996. // ------------------------------ WidenMulAccumulate
  3997. #if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE) == defined(HWY_TARGET_TOGGLE))
  3998. #ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
  3999. #undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
  4000. #else
  4001. #define HWY_NATIVE_WIDEN_MUL_ACCUMULATE
  4002. #endif
  4003. template<class D, HWY_IF_INTEGER(TFromD<D>),
  4004. class DN = RepartitionToNarrow<D>>
  4005. HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
  4006. VFromD<D> low, VFromD<D>& high) {
  4007. high = MulAdd(PromoteUpperTo(d, mul), PromoteUpperTo(d, x), high);
  4008. return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low);
  4009. }
  4010. #endif // HWY_NATIVE_WIDEN_MUL_ACCUMULATE
  4011. #if 0
  4012. #if (defined(HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16) == defined(HWY_TARGET_TOGGLE))
  4013. #ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
  4014. #undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
  4015. #else
  4016. #define HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
  4017. #endif
  4018. #if HWY_HAVE_FLOAT16
  4019. template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>>
  4020. HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
  4021. VFromD<D> low, VFromD<D>& high) {
  4022. high = MulAdd(PromoteUpperTo(d, mul), PromoteUpperTo(d, x), high);
  4023. return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), low);
  4024. }
  4025. #endif // HWY_HAVE_FLOAT16
  4026. #endif // HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
  4027. #endif // #if 0
  4028. // ------------------------------ SatWidenMulPairwiseAdd
  4029. #if (defined(HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD) == \
  4030. defined(HWY_TARGET_TOGGLE))
  4031. #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
  4032. #undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
  4033. #else
  4034. #define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD
  4035. #endif
  4036. template <class DI16, class VU8, class VI8,
  4037. class VU8_2 = Vec<Repartition<uint8_t, DI16>>, HWY_IF_I16_D(DI16),
  4038. HWY_IF_U8_D(DFromV<VU8>), HWY_IF_I8_D(DFromV<VI8>),
  4039. HWY_IF_LANES_D(DFromV<VU8>, HWY_MAX_LANES_V(VI8)),
  4040. HWY_IF_LANES_D(DFromV<VU8>, HWY_MAX_LANES_V(VU8_2))>
  4041. HWY_API Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) {
  4042. const RebindToUnsigned<decltype(di16)> du16;
  4043. const auto a0 = BitCast(di16, PromoteEvenTo(du16, a));
  4044. const auto b0 = PromoteEvenTo(di16, b);
  4045. const auto a1 = BitCast(di16, PromoteOddTo(du16, a));
  4046. const auto b1 = PromoteOddTo(di16, b);
  4047. return SaturatedAdd(Mul(a0, b0), Mul(a1, b1));
  4048. }
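// Range check (illustrative): each individual product of a u8 lane (0..255)
// and an i8 lane (-128..127) lies in [-32640, 32385] and therefore fits in an
// int16_t; only the pairwise sum can overflow, which SaturatedAdd handles.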
  4049. #endif
  4050. // ------------------------------ SatWidenMulPairwiseAccumulate
  4051. #if (defined(HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM) == \
  4052. defined(HWY_TARGET_TOGGLE))
  4053. #ifdef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
  4054. #undef HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
  4055. #else
  4056. #define HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
  4057. #endif
  4058. template <class DI32, HWY_IF_I32_D(DI32)>
  4059. HWY_API VFromD<DI32> SatWidenMulPairwiseAccumulate(
  4060. DI32 di32, VFromD<Repartition<int16_t, DI32>> a,
  4061. VFromD<Repartition<int16_t, DI32>> b, VFromD<DI32> sum) {
  4062. // WidenMulPairwiseAdd(di32, a, b) is okay here as
  4063. // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as
  4064. // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if
  4065. // a[0], b[0], a[1], and b[1] are all equal to -32768.
  4066. const auto product = WidenMulPairwiseAdd(di32, a, b);
  4067. const auto mul_overflow =
  4068. VecFromMask(di32, Eq(product, Set(di32, LimitsMin<int32_t>())));
  4069. return SaturatedAdd(Sub(sum, And(BroadcastSignBit(sum), mul_overflow)),
  4070. Add(product, mul_overflow));
  4071. }
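// Worked example (illustrative): if a[0] = a[1] = b[0] = b[1] = -32768, the
// true pairwise sum is 2^31, which WidenMulPairwiseAdd wraps to
// LimitsMin<int32_t>(). With sum = -5, the correction above computes
// SaturatedAdd(-5 + 1, 2147483647) = 2147483643, i.e. -5 + 2^31 exactly;
// for sum >= 0 the result correctly saturates to LimitsMax<int32_t>().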
  4072. #endif // HWY_NATIVE_I16_I16_SATWIDENMULPAIRWISEACCUM
  4073. // ------------------------------ SatWidenMulAccumFixedPoint
  4074. #if (defined(HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT) == \
  4075. defined(HWY_TARGET_TOGGLE))
  4076. #ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
  4077. #undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
  4078. #else
  4079. #define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
  4080. #endif
  4081. template <class DI32, HWY_IF_I32_D(DI32)>
  4082. HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
  4083. VFromD<Rebind<int16_t, DI32>> a,
  4084. VFromD<Rebind<int16_t, DI32>> b,
  4085. VFromD<DI32> sum) {
  4086. const Repartition<int16_t, DI32> dt_i16;
  4087. const auto vt_a = ResizeBitCast(dt_i16, a);
  4088. const auto vt_b = ResizeBitCast(dt_i16, b);
  4089. const auto dup_a = InterleaveWholeLower(dt_i16, vt_a, vt_a);
  4090. const auto dup_b = InterleaveWholeLower(dt_i16, vt_b, vt_b);
  4091. return SatWidenMulPairwiseAccumulate(di32, dup_a, dup_b, sum);
  4092. }
  4093. #endif // HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
  4094. // ------------------------------ SumOfMulQuadAccumulate
  4095. #if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \
  4096. defined(HWY_TARGET_TOGGLE))
  4097. #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
  4098. #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
  4099. #else
  4100. #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
  4101. #endif
  4102. template <class DI32, HWY_IF_I32_D(DI32)>
  4103. HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 di32,
  4104. VFromD<Repartition<int8_t, DI32>> a,
  4105. VFromD<Repartition<int8_t, DI32>> b,
  4106. VFromD<DI32> sum) {
  4107. const Repartition<int16_t, decltype(di32)> di16;
  4108. const auto a0 = PromoteEvenTo(di16, a);
  4109. const auto b0 = PromoteEvenTo(di16, b);
  4110. const auto a1 = PromoteOddTo(di16, a);
  4111. const auto b1 = PromoteOddTo(di16, b);
  4112. return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0),
  4113. WidenMulPairwiseAdd(di32, a1, b1)));
  4114. }
  4115. #endif
  4116. #if (defined(HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE) == \
  4117. defined(HWY_TARGET_TOGGLE))
  4118. #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
  4119. #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
  4120. #else
  4121. #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
  4122. #endif
  4123. template <class DU32, HWY_IF_U32_D(DU32)>
  4124. HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
  4125. DU32 du32, VFromD<Repartition<uint8_t, DU32>> a,
  4126. VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
  4127. const Repartition<uint16_t, decltype(du32)> du16;
  4128. const RebindToSigned<decltype(du16)> di16;
  4129. const RebindToSigned<decltype(du32)> di32;
  4130. const auto lo8_mask = Set(di16, int16_t{0x00FF});
  4131. const auto a0 = And(BitCast(di16, a), lo8_mask);
  4132. const auto b0 = And(BitCast(di16, b), lo8_mask);
  4133. const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a)));
  4134. const auto b1 = BitCast(di16, ShiftRight<8>(BitCast(du16, b)));
  4135. return Add(sum, Add(BitCast(du32, WidenMulPairwiseAdd(di32, a0, b0)),
  4136. BitCast(du32, WidenMulPairwiseAdd(di32, a1, b1))));
  4137. }
  4138. #endif
#if (defined(HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
#endif

template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
    DI32 di32, VFromD<Repartition<uint8_t, DI32>> a_u,
    VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
  const Repartition<int16_t, decltype(di32)> di16;
  const RebindToUnsigned<decltype(di16)> du16;

  const auto a0 = And(BitCast(di16, a_u), Set(di16, int16_t{0x00FF}));
  const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b_i)));

  const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a_u)));
  const auto b1 = ShiftRight<8>(BitCast(di16, b_i));

  // NOTE: SatWidenMulPairwiseAdd(di16, a_u, b_i) cannot be used in
  // SumOfMulQuadAccumulate as it is possible for
  // a_u[0]*b_i[0] + a_u[1]*b_i[1] to overflow an int16_t if a_u[0], b_i[0],
  // a_u[1], and b_i[1] are all non-zero and b_i[0] and b_i[1] have the same
  // sign.
  return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0),
                      WidenMulPairwiseAdd(di32, a1, b1)));
}

#endif  // HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
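
// Concrete instance of the overflow noted above: a_u[0] = a_u[1] = 255 and
// b_i[0] = b_i[1] = 127 gives 255*127 + 255*127 = 64770, which exceeds the
// int16_t maximum of 32767, so an i16 saturating pairwise add would clamp and
// yield a wrong dot product; widening to int32_t avoids this.
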
#if (defined(HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
#endif

#if HWY_HAVE_INTEGER64
template <class DI64, HWY_IF_I64_D(DI64)>
HWY_API VFromD<DI64> SumOfMulQuadAccumulate(
    DI64 di64, VFromD<Repartition<int16_t, DI64>> a,
    VFromD<Repartition<int16_t, DI64>> b, VFromD<DI64> sum) {
  const Repartition<int32_t, decltype(di64)> di32;

  // WidenMulPairwiseAdd(di32, a, b) is okay here: a[0]*b[0] + a[1]*b[1] is
  // between -2147418112 and 2147483648, and it can only overflow an int32_t
  // if a[0], b[0], a[1], and b[1] are all equal to -32768.
  const auto i32_pairwise_sum = WidenMulPairwiseAdd(di32, a, b);
  const auto i32_pairwise_sum_overflow =
      VecFromMask(di32, Eq(i32_pairwise_sum, Set(di32, LimitsMin<int32_t>())));

  // The upper 32 bits of the p0 and p1 lanes below need to be zeroed out in
  // the case of overflow.
  const auto hi32_mask = Set(di64, static_cast<int64_t>(~int64_t{0xFFFFFFFF}));
  const auto p0_zero_out_mask =
      ShiftLeft<32>(BitCast(di64, i32_pairwise_sum_overflow));
  const auto p1_zero_out_mask =
      And(BitCast(di64, i32_pairwise_sum_overflow), hi32_mask);

  const auto p0 =
      AndNot(p0_zero_out_mask,
             ShiftRight<32>(ShiftLeft<32>(BitCast(di64, i32_pairwise_sum))));
  const auto p1 =
      AndNot(p1_zero_out_mask, ShiftRight<32>(BitCast(di64, i32_pairwise_sum)));

  return Add(sum, Add(p0, p1));
}
#endif  // HWY_HAVE_INTEGER64
#endif  // HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE
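
// Worked example of the overflow handling above: if a[0..3] = b[0..3] =
// -32768, each pairwise i32 sum is 2^31, which wraps to INT32_MIN. Sign
// extension to 64 bits would contribute -2^31 to the accumulator; clearing
// the upper 32 bits of the affected p0/p1 lanes instead contributes the
// correct +2^31.
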
#if (defined(HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
#undef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
#else
#define HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
#endif

#if HWY_HAVE_INTEGER64
template <class DU64, HWY_IF_U64_D(DU64)>
HWY_API VFromD<DU64> SumOfMulQuadAccumulate(
    DU64 du64, VFromD<Repartition<uint16_t, DU64>> a,
    VFromD<Repartition<uint16_t, DU64>> b, VFromD<DU64> sum) {
  const auto u32_even_prod = MulEven(a, b);
  const auto u32_odd_prod = MulOdd(a, b);

  const auto p0 = Add(PromoteEvenTo(du64, u32_even_prod),
                      PromoteEvenTo(du64, u32_odd_prod));
  const auto p1 =
      Add(PromoteOddTo(du64, u32_even_prod), PromoteOddTo(du64, u32_odd_prod));

  return Add(sum, Add(p0, p1));
}
#endif  // HWY_HAVE_INTEGER64
#endif  // HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE
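
// Usage sketch (illustrative only; `pa`, `pb` and `num` are hypothetical, with
// num a multiple of the vector length): a u16 dot product with u64
// accumulators, reduced to a scalar at the end.
//   const ScalableTag<uint64_t> du64;
//   const Repartition<uint16_t, decltype(du64)> du16;
//   auto acc = Zero(du64);
//   for (size_t i = 0; i < num; i += Lanes(du16)) {
//     acc = SumOfMulQuadAccumulate(du64, LoadU(du16, pa + i),
//                                  LoadU(du16, pb + i), acc);
//   }
//   const uint64_t dot = GetLane(SumOfLanes(du64, acc));
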
// ------------------------------ F64 ApproximateReciprocal

#if (defined(HWY_NATIVE_F64_APPROX_RECIP) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F64_APPROX_RECIP
#undef HWY_NATIVE_F64_APPROX_RECIP
#else
#define HWY_NATIVE_F64_APPROX_RECIP
#endif

#if HWY_HAVE_FLOAT64
template <class V, HWY_IF_F64_D(DFromV<V>)>
HWY_API V ApproximateReciprocal(V v) {
  const DFromV<decltype(v)> d;
  return Div(Set(d, 1.0), v);
}
#endif  // HWY_HAVE_FLOAT64
#endif  // HWY_NATIVE_F64_APPROX_RECIP

// ------------------------------ F64 ApproximateReciprocalSqrt

#if (defined(HWY_NATIVE_F64_APPROX_RSQRT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
#undef HWY_NATIVE_F64_APPROX_RSQRT
#else
#define HWY_NATIVE_F64_APPROX_RSQRT
#endif

#if HWY_HAVE_FLOAT64
template <class V, HWY_IF_F64_D(DFromV<V>)>
HWY_API V ApproximateReciprocalSqrt(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto half = Mul(v, Set(d, 0.5));
  // Initial guess based on log2(f)
  const auto guess = BitCast(d, Sub(Set(du, uint64_t{0x5FE6EB50C7B537A9u}),
                                    ShiftRight<1>(BitCast(du, v))));
  // One Newton-Raphson iteration
  return Mul(guess, NegMulAdd(Mul(half, guess), guess, Set(d, 1.5)));
}
#endif  // HWY_HAVE_FLOAT64
#endif  // HWY_NATIVE_F64_APPROX_RSQRT
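
// Scalar sketch of the same approximation (assumes the standard "fast inverse
// square root" derivation; 0x5FE6EB50C7B537A9 is the double-precision
// analogue of the well-known 32-bit constant 0x5F3759DF):
//   uint64_t bits;
//   memcpy(&bits, &x, sizeof(bits));
//   bits = 0x5FE6EB50C7B537A9ull - (bits >> 1);  // guess ~ 1/sqrt(x)
//   double y;
//   memcpy(&y, &bits, sizeof(y));
//   y = y * (1.5 - 0.5 * x * y * y);             // one Newton-Raphson step
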
// ------------------------------ Compress*

#if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_COMPRESS8
#undef HWY_NATIVE_COMPRESS8
#else
#define HWY_NATIVE_COMPRESS8
#endif

template <class V, class D, typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API size_t CompressBitsStore(V v, const uint8_t* HWY_RESTRICT bits, D d,
                                 T* unaligned) {
  HWY_ALIGN T lanes[MaxLanes(d)];
  Store(v, d, lanes);

  const Simd<T, HWY_MIN(MaxLanes(d), 8), 0> d8;
  T* HWY_RESTRICT pos = unaligned;

  HWY_ALIGN constexpr T table[2048] = {
  4274. 0, 1, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
  4275. 1, 0, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
  4276. 2, 0, 1, 3, 4, 5, 6, 7, /**/ 0, 2, 1, 3, 4, 5, 6, 7, //
  4277. 1, 2, 0, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
  4278. 3, 0, 1, 2, 4, 5, 6, 7, /**/ 0, 3, 1, 2, 4, 5, 6, 7, //
  4279. 1, 3, 0, 2, 4, 5, 6, 7, /**/ 0, 1, 3, 2, 4, 5, 6, 7, //
  4280. 2, 3, 0, 1, 4, 5, 6, 7, /**/ 0, 2, 3, 1, 4, 5, 6, 7, //
  4281. 1, 2, 3, 0, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
  4282. 4, 0, 1, 2, 3, 5, 6, 7, /**/ 0, 4, 1, 2, 3, 5, 6, 7, //
  4283. 1, 4, 0, 2, 3, 5, 6, 7, /**/ 0, 1, 4, 2, 3, 5, 6, 7, //
  4284. 2, 4, 0, 1, 3, 5, 6, 7, /**/ 0, 2, 4, 1, 3, 5, 6, 7, //
  4285. 1, 2, 4, 0, 3, 5, 6, 7, /**/ 0, 1, 2, 4, 3, 5, 6, 7, //
  4286. 3, 4, 0, 1, 2, 5, 6, 7, /**/ 0, 3, 4, 1, 2, 5, 6, 7, //
  4287. 1, 3, 4, 0, 2, 5, 6, 7, /**/ 0, 1, 3, 4, 2, 5, 6, 7, //
  4288. 2, 3, 4, 0, 1, 5, 6, 7, /**/ 0, 2, 3, 4, 1, 5, 6, 7, //
  4289. 1, 2, 3, 4, 0, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
  4290. 5, 0, 1, 2, 3, 4, 6, 7, /**/ 0, 5, 1, 2, 3, 4, 6, 7, //
  4291. 1, 5, 0, 2, 3, 4, 6, 7, /**/ 0, 1, 5, 2, 3, 4, 6, 7, //
  4292. 2, 5, 0, 1, 3, 4, 6, 7, /**/ 0, 2, 5, 1, 3, 4, 6, 7, //
  4293. 1, 2, 5, 0, 3, 4, 6, 7, /**/ 0, 1, 2, 5, 3, 4, 6, 7, //
  4294. 3, 5, 0, 1, 2, 4, 6, 7, /**/ 0, 3, 5, 1, 2, 4, 6, 7, //
  4295. 1, 3, 5, 0, 2, 4, 6, 7, /**/ 0, 1, 3, 5, 2, 4, 6, 7, //
  4296. 2, 3, 5, 0, 1, 4, 6, 7, /**/ 0, 2, 3, 5, 1, 4, 6, 7, //
  4297. 1, 2, 3, 5, 0, 4, 6, 7, /**/ 0, 1, 2, 3, 5, 4, 6, 7, //
  4298. 4, 5, 0, 1, 2, 3, 6, 7, /**/ 0, 4, 5, 1, 2, 3, 6, 7, //
  4299. 1, 4, 5, 0, 2, 3, 6, 7, /**/ 0, 1, 4, 5, 2, 3, 6, 7, //
  4300. 2, 4, 5, 0, 1, 3, 6, 7, /**/ 0, 2, 4, 5, 1, 3, 6, 7, //
  4301. 1, 2, 4, 5, 0, 3, 6, 7, /**/ 0, 1, 2, 4, 5, 3, 6, 7, //
  4302. 3, 4, 5, 0, 1, 2, 6, 7, /**/ 0, 3, 4, 5, 1, 2, 6, 7, //
  4303. 1, 3, 4, 5, 0, 2, 6, 7, /**/ 0, 1, 3, 4, 5, 2, 6, 7, //
  4304. 2, 3, 4, 5, 0, 1, 6, 7, /**/ 0, 2, 3, 4, 5, 1, 6, 7, //
  4305. 1, 2, 3, 4, 5, 0, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
  4306. 6, 0, 1, 2, 3, 4, 5, 7, /**/ 0, 6, 1, 2, 3, 4, 5, 7, //
  4307. 1, 6, 0, 2, 3, 4, 5, 7, /**/ 0, 1, 6, 2, 3, 4, 5, 7, //
  4308. 2, 6, 0, 1, 3, 4, 5, 7, /**/ 0, 2, 6, 1, 3, 4, 5, 7, //
  4309. 1, 2, 6, 0, 3, 4, 5, 7, /**/ 0, 1, 2, 6, 3, 4, 5, 7, //
  4310. 3, 6, 0, 1, 2, 4, 5, 7, /**/ 0, 3, 6, 1, 2, 4, 5, 7, //
  4311. 1, 3, 6, 0, 2, 4, 5, 7, /**/ 0, 1, 3, 6, 2, 4, 5, 7, //
  4312. 2, 3, 6, 0, 1, 4, 5, 7, /**/ 0, 2, 3, 6, 1, 4, 5, 7, //
  4313. 1, 2, 3, 6, 0, 4, 5, 7, /**/ 0, 1, 2, 3, 6, 4, 5, 7, //
  4314. 4, 6, 0, 1, 2, 3, 5, 7, /**/ 0, 4, 6, 1, 2, 3, 5, 7, //
  4315. 1, 4, 6, 0, 2, 3, 5, 7, /**/ 0, 1, 4, 6, 2, 3, 5, 7, //
  4316. 2, 4, 6, 0, 1, 3, 5, 7, /**/ 0, 2, 4, 6, 1, 3, 5, 7, //
  4317. 1, 2, 4, 6, 0, 3, 5, 7, /**/ 0, 1, 2, 4, 6, 3, 5, 7, //
  4318. 3, 4, 6, 0, 1, 2, 5, 7, /**/ 0, 3, 4, 6, 1, 2, 5, 7, //
  4319. 1, 3, 4, 6, 0, 2, 5, 7, /**/ 0, 1, 3, 4, 6, 2, 5, 7, //
  4320. 2, 3, 4, 6, 0, 1, 5, 7, /**/ 0, 2, 3, 4, 6, 1, 5, 7, //
  4321. 1, 2, 3, 4, 6, 0, 5, 7, /**/ 0, 1, 2, 3, 4, 6, 5, 7, //
  4322. 5, 6, 0, 1, 2, 3, 4, 7, /**/ 0, 5, 6, 1, 2, 3, 4, 7, //
  4323. 1, 5, 6, 0, 2, 3, 4, 7, /**/ 0, 1, 5, 6, 2, 3, 4, 7, //
  4324. 2, 5, 6, 0, 1, 3, 4, 7, /**/ 0, 2, 5, 6, 1, 3, 4, 7, //
  4325. 1, 2, 5, 6, 0, 3, 4, 7, /**/ 0, 1, 2, 5, 6, 3, 4, 7, //
  4326. 3, 5, 6, 0, 1, 2, 4, 7, /**/ 0, 3, 5, 6, 1, 2, 4, 7, //
  4327. 1, 3, 5, 6, 0, 2, 4, 7, /**/ 0, 1, 3, 5, 6, 2, 4, 7, //
  4328. 2, 3, 5, 6, 0, 1, 4, 7, /**/ 0, 2, 3, 5, 6, 1, 4, 7, //
  4329. 1, 2, 3, 5, 6, 0, 4, 7, /**/ 0, 1, 2, 3, 5, 6, 4, 7, //
  4330. 4, 5, 6, 0, 1, 2, 3, 7, /**/ 0, 4, 5, 6, 1, 2, 3, 7, //
  4331. 1, 4, 5, 6, 0, 2, 3, 7, /**/ 0, 1, 4, 5, 6, 2, 3, 7, //
  4332. 2, 4, 5, 6, 0, 1, 3, 7, /**/ 0, 2, 4, 5, 6, 1, 3, 7, //
  4333. 1, 2, 4, 5, 6, 0, 3, 7, /**/ 0, 1, 2, 4, 5, 6, 3, 7, //
  4334. 3, 4, 5, 6, 0, 1, 2, 7, /**/ 0, 3, 4, 5, 6, 1, 2, 7, //
  4335. 1, 3, 4, 5, 6, 0, 2, 7, /**/ 0, 1, 3, 4, 5, 6, 2, 7, //
  4336. 2, 3, 4, 5, 6, 0, 1, 7, /**/ 0, 2, 3, 4, 5, 6, 1, 7, //
  4337. 1, 2, 3, 4, 5, 6, 0, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
  4338. 7, 0, 1, 2, 3, 4, 5, 6, /**/ 0, 7, 1, 2, 3, 4, 5, 6, //
  4339. 1, 7, 0, 2, 3, 4, 5, 6, /**/ 0, 1, 7, 2, 3, 4, 5, 6, //
  4340. 2, 7, 0, 1, 3, 4, 5, 6, /**/ 0, 2, 7, 1, 3, 4, 5, 6, //
  4341. 1, 2, 7, 0, 3, 4, 5, 6, /**/ 0, 1, 2, 7, 3, 4, 5, 6, //
  4342. 3, 7, 0, 1, 2, 4, 5, 6, /**/ 0, 3, 7, 1, 2, 4, 5, 6, //
  4343. 1, 3, 7, 0, 2, 4, 5, 6, /**/ 0, 1, 3, 7, 2, 4, 5, 6, //
  4344. 2, 3, 7, 0, 1, 4, 5, 6, /**/ 0, 2, 3, 7, 1, 4, 5, 6, //
  4345. 1, 2, 3, 7, 0, 4, 5, 6, /**/ 0, 1, 2, 3, 7, 4, 5, 6, //
  4346. 4, 7, 0, 1, 2, 3, 5, 6, /**/ 0, 4, 7, 1, 2, 3, 5, 6, //
  4347. 1, 4, 7, 0, 2, 3, 5, 6, /**/ 0, 1, 4, 7, 2, 3, 5, 6, //
  4348. 2, 4, 7, 0, 1, 3, 5, 6, /**/ 0, 2, 4, 7, 1, 3, 5, 6, //
  4349. 1, 2, 4, 7, 0, 3, 5, 6, /**/ 0, 1, 2, 4, 7, 3, 5, 6, //
  4350. 3, 4, 7, 0, 1, 2, 5, 6, /**/ 0, 3, 4, 7, 1, 2, 5, 6, //
  4351. 1, 3, 4, 7, 0, 2, 5, 6, /**/ 0, 1, 3, 4, 7, 2, 5, 6, //
  4352. 2, 3, 4, 7, 0, 1, 5, 6, /**/ 0, 2, 3, 4, 7, 1, 5, 6, //
  4353. 1, 2, 3, 4, 7, 0, 5, 6, /**/ 0, 1, 2, 3, 4, 7, 5, 6, //
  4354. 5, 7, 0, 1, 2, 3, 4, 6, /**/ 0, 5, 7, 1, 2, 3, 4, 6, //
  4355. 1, 5, 7, 0, 2, 3, 4, 6, /**/ 0, 1, 5, 7, 2, 3, 4, 6, //
  4356. 2, 5, 7, 0, 1, 3, 4, 6, /**/ 0, 2, 5, 7, 1, 3, 4, 6, //
  4357. 1, 2, 5, 7, 0, 3, 4, 6, /**/ 0, 1, 2, 5, 7, 3, 4, 6, //
  4358. 3, 5, 7, 0, 1, 2, 4, 6, /**/ 0, 3, 5, 7, 1, 2, 4, 6, //
  4359. 1, 3, 5, 7, 0, 2, 4, 6, /**/ 0, 1, 3, 5, 7, 2, 4, 6, //
  4360. 2, 3, 5, 7, 0, 1, 4, 6, /**/ 0, 2, 3, 5, 7, 1, 4, 6, //
  4361. 1, 2, 3, 5, 7, 0, 4, 6, /**/ 0, 1, 2, 3, 5, 7, 4, 6, //
  4362. 4, 5, 7, 0, 1, 2, 3, 6, /**/ 0, 4, 5, 7, 1, 2, 3, 6, //
  4363. 1, 4, 5, 7, 0, 2, 3, 6, /**/ 0, 1, 4, 5, 7, 2, 3, 6, //
  4364. 2, 4, 5, 7, 0, 1, 3, 6, /**/ 0, 2, 4, 5, 7, 1, 3, 6, //
  4365. 1, 2, 4, 5, 7, 0, 3, 6, /**/ 0, 1, 2, 4, 5, 7, 3, 6, //
  4366. 3, 4, 5, 7, 0, 1, 2, 6, /**/ 0, 3, 4, 5, 7, 1, 2, 6, //
  4367. 1, 3, 4, 5, 7, 0, 2, 6, /**/ 0, 1, 3, 4, 5, 7, 2, 6, //
  4368. 2, 3, 4, 5, 7, 0, 1, 6, /**/ 0, 2, 3, 4, 5, 7, 1, 6, //
  4369. 1, 2, 3, 4, 5, 7, 0, 6, /**/ 0, 1, 2, 3, 4, 5, 7, 6, //
  4370. 6, 7, 0, 1, 2, 3, 4, 5, /**/ 0, 6, 7, 1, 2, 3, 4, 5, //
  4371. 1, 6, 7, 0, 2, 3, 4, 5, /**/ 0, 1, 6, 7, 2, 3, 4, 5, //
  4372. 2, 6, 7, 0, 1, 3, 4, 5, /**/ 0, 2, 6, 7, 1, 3, 4, 5, //
  4373. 1, 2, 6, 7, 0, 3, 4, 5, /**/ 0, 1, 2, 6, 7, 3, 4, 5, //
  4374. 3, 6, 7, 0, 1, 2, 4, 5, /**/ 0, 3, 6, 7, 1, 2, 4, 5, //
  4375. 1, 3, 6, 7, 0, 2, 4, 5, /**/ 0, 1, 3, 6, 7, 2, 4, 5, //
  4376. 2, 3, 6, 7, 0, 1, 4, 5, /**/ 0, 2, 3, 6, 7, 1, 4, 5, //
  4377. 1, 2, 3, 6, 7, 0, 4, 5, /**/ 0, 1, 2, 3, 6, 7, 4, 5, //
  4378. 4, 6, 7, 0, 1, 2, 3, 5, /**/ 0, 4, 6, 7, 1, 2, 3, 5, //
  4379. 1, 4, 6, 7, 0, 2, 3, 5, /**/ 0, 1, 4, 6, 7, 2, 3, 5, //
  4380. 2, 4, 6, 7, 0, 1, 3, 5, /**/ 0, 2, 4, 6, 7, 1, 3, 5, //
  4381. 1, 2, 4, 6, 7, 0, 3, 5, /**/ 0, 1, 2, 4, 6, 7, 3, 5, //
  4382. 3, 4, 6, 7, 0, 1, 2, 5, /**/ 0, 3, 4, 6, 7, 1, 2, 5, //
  4383. 1, 3, 4, 6, 7, 0, 2, 5, /**/ 0, 1, 3, 4, 6, 7, 2, 5, //
  4384. 2, 3, 4, 6, 7, 0, 1, 5, /**/ 0, 2, 3, 4, 6, 7, 1, 5, //
  4385. 1, 2, 3, 4, 6, 7, 0, 5, /**/ 0, 1, 2, 3, 4, 6, 7, 5, //
  4386. 5, 6, 7, 0, 1, 2, 3, 4, /**/ 0, 5, 6, 7, 1, 2, 3, 4, //
  4387. 1, 5, 6, 7, 0, 2, 3, 4, /**/ 0, 1, 5, 6, 7, 2, 3, 4, //
  4388. 2, 5, 6, 7, 0, 1, 3, 4, /**/ 0, 2, 5, 6, 7, 1, 3, 4, //
  4389. 1, 2, 5, 6, 7, 0, 3, 4, /**/ 0, 1, 2, 5, 6, 7, 3, 4, //
  4390. 3, 5, 6, 7, 0, 1, 2, 4, /**/ 0, 3, 5, 6, 7, 1, 2, 4, //
  4391. 1, 3, 5, 6, 7, 0, 2, 4, /**/ 0, 1, 3, 5, 6, 7, 2, 4, //
  4392. 2, 3, 5, 6, 7, 0, 1, 4, /**/ 0, 2, 3, 5, 6, 7, 1, 4, //
  4393. 1, 2, 3, 5, 6, 7, 0, 4, /**/ 0, 1, 2, 3, 5, 6, 7, 4, //
  4394. 4, 5, 6, 7, 0, 1, 2, 3, /**/ 0, 4, 5, 6, 7, 1, 2, 3, //
  4395. 1, 4, 5, 6, 7, 0, 2, 3, /**/ 0, 1, 4, 5, 6, 7, 2, 3, //
  4396. 2, 4, 5, 6, 7, 0, 1, 3, /**/ 0, 2, 4, 5, 6, 7, 1, 3, //
  4397. 1, 2, 4, 5, 6, 7, 0, 3, /**/ 0, 1, 2, 4, 5, 6, 7, 3, //
  4398. 3, 4, 5, 6, 7, 0, 1, 2, /**/ 0, 3, 4, 5, 6, 7, 1, 2, //
  4399. 1, 3, 4, 5, 6, 7, 0, 2, /**/ 0, 1, 3, 4, 5, 6, 7, 2, //
  4400. 2, 3, 4, 5, 6, 7, 0, 1, /**/ 0, 2, 3, 4, 5, 6, 7, 1, //
  4401. 1, 2, 3, 4, 5, 6, 7, 0, /**/ 0, 1, 2, 3, 4, 5, 6, 7};
  for (size_t i = 0; i < Lanes(d); i += 8) {
    // Each byte of `bits` selects one of the 256 8-entry index rows above;
    // its population count determines how far to advance the write position.
    const size_t bits8 = bits[i / 8];
    const auto indices = Load(d8, table + bits8 * 8);
    const auto compressed = TableLookupBytes(LoadU(d8, lanes + i), indices);
    StoreU(compressed, d8, pos);
    pos += PopCount(bits8);
  }
  return static_cast<size_t>(pos - unaligned);
}

template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API size_t CompressStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) {
  uint8_t bits[HWY_MAX(size_t{8}, MaxLanes(d) / 8)];
  (void)StoreMaskBits(d, mask, bits);
  return CompressBitsStore(v, bits, d, unaligned);
}

template <class V, class M, class D, typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API size_t CompressBlendedStore(V v, M mask, D d,
                                    T* HWY_RESTRICT unaligned) {
  HWY_ALIGN T buf[MaxLanes(d)];
  const size_t bytes = CompressStore(v, mask, d, buf);
  BlendedStore(Load(d, buf), FirstN(d, bytes), d, unaligned);
  return bytes;
}

// For reasons unknown, HWY_IF_T_SIZE_V is a compile error in SVE.
template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
HWY_API V Compress(V v, const M mask) {
  const DFromV<V> d;
  HWY_ALIGN T lanes[MaxLanes(d)];
  (void)CompressStore(v, mask, d, lanes);
  return Load(d, lanes);
}

template <class V, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
  const DFromV<V> d;
  HWY_ALIGN T lanes[MaxLanes(d)];
  (void)CompressBitsStore(v, bits, d, lanes);
  return Load(d, lanes);
}

template <class V, class M, typename T = TFromV<V>, HWY_IF_T_SIZE(T, 1)>
HWY_API V CompressNot(V v, M mask) {
  return Compress(v, Not(mask));
}

#endif  // HWY_NATIVE_COMPRESS8
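
// Usage sketch (illustrative only; `in` and `out` are hypothetical buffers
// with at least Lanes(d) elements): keep only the bytes above a threshold,
// packed to the front of `out`; CompressStore returns how many lanes were
// written.
//   const ScalableTag<uint8_t> d;
//   const auto v = LoadU(d, in);
//   const auto m = Gt(v, Set(d, uint8_t{127}));
//   const size_t count = CompressStore(v, m, d, out);
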
// ------------------------------ Expand

// Note that this generic implementation assumes <= 128 bit fixed vectors;
// the SVE and RVV targets provide their own native implementations.
#if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
#ifdef HWY_NATIVE_EXPAND
#undef HWY_NATIVE_EXPAND
#else
#define HWY_NATIVE_EXPAND
#endif

namespace detail {

#if HWY_IDE
template <class M>
HWY_INLINE uint64_t BitsFromMask(M /* mask */) {
  return 0;
}
#endif  // HWY_IDE

template <size_t N>
HWY_INLINE Vec128<uint8_t, N> IndicesForExpandFromBits(uint64_t mask_bits) {
  static_assert(N <= 8, "Should only be called for half-vectors");
  const Simd<uint8_t, N, 0> du8;
  HWY_DASSERT(mask_bits < 0x100);
  alignas(16) static constexpr uint8_t table[2048] = {
      // PrintExpand8x8Tables
  4470. 128, 128, 128, 128, 128, 128, 128, 128, //
  4471. 0, 128, 128, 128, 128, 128, 128, 128, //
  4472. 128, 0, 128, 128, 128, 128, 128, 128, //
  4473. 0, 1, 128, 128, 128, 128, 128, 128, //
  4474. 128, 128, 0, 128, 128, 128, 128, 128, //
  4475. 0, 128, 1, 128, 128, 128, 128, 128, //
  4476. 128, 0, 1, 128, 128, 128, 128, 128, //
  4477. 0, 1, 2, 128, 128, 128, 128, 128, //
  4478. 128, 128, 128, 0, 128, 128, 128, 128, //
  4479. 0, 128, 128, 1, 128, 128, 128, 128, //
  4480. 128, 0, 128, 1, 128, 128, 128, 128, //
  4481. 0, 1, 128, 2, 128, 128, 128, 128, //
  4482. 128, 128, 0, 1, 128, 128, 128, 128, //
  4483. 0, 128, 1, 2, 128, 128, 128, 128, //
  4484. 128, 0, 1, 2, 128, 128, 128, 128, //
  4485. 0, 1, 2, 3, 128, 128, 128, 128, //
  4486. 128, 128, 128, 128, 0, 128, 128, 128, //
  4487. 0, 128, 128, 128, 1, 128, 128, 128, //
  4488. 128, 0, 128, 128, 1, 128, 128, 128, //
  4489. 0, 1, 128, 128, 2, 128, 128, 128, //
  4490. 128, 128, 0, 128, 1, 128, 128, 128, //
  4491. 0, 128, 1, 128, 2, 128, 128, 128, //
  4492. 128, 0, 1, 128, 2, 128, 128, 128, //
  4493. 0, 1, 2, 128, 3, 128, 128, 128, //
  4494. 128, 128, 128, 0, 1, 128, 128, 128, //
  4495. 0, 128, 128, 1, 2, 128, 128, 128, //
  4496. 128, 0, 128, 1, 2, 128, 128, 128, //
  4497. 0, 1, 128, 2, 3, 128, 128, 128, //
  4498. 128, 128, 0, 1, 2, 128, 128, 128, //
  4499. 0, 128, 1, 2, 3, 128, 128, 128, //
  4500. 128, 0, 1, 2, 3, 128, 128, 128, //
  4501. 0, 1, 2, 3, 4, 128, 128, 128, //
  4502. 128, 128, 128, 128, 128, 0, 128, 128, //
  4503. 0, 128, 128, 128, 128, 1, 128, 128, //
  4504. 128, 0, 128, 128, 128, 1, 128, 128, //
  4505. 0, 1, 128, 128, 128, 2, 128, 128, //
  4506. 128, 128, 0, 128, 128, 1, 128, 128, //
  4507. 0, 128, 1, 128, 128, 2, 128, 128, //
  4508. 128, 0, 1, 128, 128, 2, 128, 128, //
  4509. 0, 1, 2, 128, 128, 3, 128, 128, //
  4510. 128, 128, 128, 0, 128, 1, 128, 128, //
  4511. 0, 128, 128, 1, 128, 2, 128, 128, //
  4512. 128, 0, 128, 1, 128, 2, 128, 128, //
  4513. 0, 1, 128, 2, 128, 3, 128, 128, //
  4514. 128, 128, 0, 1, 128, 2, 128, 128, //
  4515. 0, 128, 1, 2, 128, 3, 128, 128, //
  4516. 128, 0, 1, 2, 128, 3, 128, 128, //
  4517. 0, 1, 2, 3, 128, 4, 128, 128, //
  4518. 128, 128, 128, 128, 0, 1, 128, 128, //
  4519. 0, 128, 128, 128, 1, 2, 128, 128, //
  4520. 128, 0, 128, 128, 1, 2, 128, 128, //
  4521. 0, 1, 128, 128, 2, 3, 128, 128, //
  4522. 128, 128, 0, 128, 1, 2, 128, 128, //
  4523. 0, 128, 1, 128, 2, 3, 128, 128, //
  4524. 128, 0, 1, 128, 2, 3, 128, 128, //
  4525. 0, 1, 2, 128, 3, 4, 128, 128, //
  4526. 128, 128, 128, 0, 1, 2, 128, 128, //
  4527. 0, 128, 128, 1, 2, 3, 128, 128, //
  4528. 128, 0, 128, 1, 2, 3, 128, 128, //
  4529. 0, 1, 128, 2, 3, 4, 128, 128, //
  4530. 128, 128, 0, 1, 2, 3, 128, 128, //
  4531. 0, 128, 1, 2, 3, 4, 128, 128, //
  4532. 128, 0, 1, 2, 3, 4, 128, 128, //
  4533. 0, 1, 2, 3, 4, 5, 128, 128, //
  4534. 128, 128, 128, 128, 128, 128, 0, 128, //
  4535. 0, 128, 128, 128, 128, 128, 1, 128, //
  4536. 128, 0, 128, 128, 128, 128, 1, 128, //
  4537. 0, 1, 128, 128, 128, 128, 2, 128, //
  4538. 128, 128, 0, 128, 128, 128, 1, 128, //
  4539. 0, 128, 1, 128, 128, 128, 2, 128, //
  4540. 128, 0, 1, 128, 128, 128, 2, 128, //
  4541. 0, 1, 2, 128, 128, 128, 3, 128, //
  4542. 128, 128, 128, 0, 128, 128, 1, 128, //
  4543. 0, 128, 128, 1, 128, 128, 2, 128, //
  4544. 128, 0, 128, 1, 128, 128, 2, 128, //
  4545. 0, 1, 128, 2, 128, 128, 3, 128, //
  4546. 128, 128, 0, 1, 128, 128, 2, 128, //
  4547. 0, 128, 1, 2, 128, 128, 3, 128, //
  4548. 128, 0, 1, 2, 128, 128, 3, 128, //
  4549. 0, 1, 2, 3, 128, 128, 4, 128, //
  4550. 128, 128, 128, 128, 0, 128, 1, 128, //
  4551. 0, 128, 128, 128, 1, 128, 2, 128, //
  4552. 128, 0, 128, 128, 1, 128, 2, 128, //
  4553. 0, 1, 128, 128, 2, 128, 3, 128, //
  4554. 128, 128, 0, 128, 1, 128, 2, 128, //
  4555. 0, 128, 1, 128, 2, 128, 3, 128, //
  4556. 128, 0, 1, 128, 2, 128, 3, 128, //
  4557. 0, 1, 2, 128, 3, 128, 4, 128, //
  4558. 128, 128, 128, 0, 1, 128, 2, 128, //
  4559. 0, 128, 128, 1, 2, 128, 3, 128, //
  4560. 128, 0, 128, 1, 2, 128, 3, 128, //
  4561. 0, 1, 128, 2, 3, 128, 4, 128, //
  4562. 128, 128, 0, 1, 2, 128, 3, 128, //
  4563. 0, 128, 1, 2, 3, 128, 4, 128, //
  4564. 128, 0, 1, 2, 3, 128, 4, 128, //
  4565. 0, 1, 2, 3, 4, 128, 5, 128, //
  4566. 128, 128, 128, 128, 128, 0, 1, 128, //
  4567. 0, 128, 128, 128, 128, 1, 2, 128, //
  4568. 128, 0, 128, 128, 128, 1, 2, 128, //
  4569. 0, 1, 128, 128, 128, 2, 3, 128, //
  4570. 128, 128, 0, 128, 128, 1, 2, 128, //
  4571. 0, 128, 1, 128, 128, 2, 3, 128, //
  4572. 128, 0, 1, 128, 128, 2, 3, 128, //
  4573. 0, 1, 2, 128, 128, 3, 4, 128, //
  4574. 128, 128, 128, 0, 128, 1, 2, 128, //
  4575. 0, 128, 128, 1, 128, 2, 3, 128, //
  4576. 128, 0, 128, 1, 128, 2, 3, 128, //
  4577. 0, 1, 128, 2, 128, 3, 4, 128, //
  4578. 128, 128, 0, 1, 128, 2, 3, 128, //
  4579. 0, 128, 1, 2, 128, 3, 4, 128, //
  4580. 128, 0, 1, 2, 128, 3, 4, 128, //
  4581. 0, 1, 2, 3, 128, 4, 5, 128, //
  4582. 128, 128, 128, 128, 0, 1, 2, 128, //
  4583. 0, 128, 128, 128, 1, 2, 3, 128, //
  4584. 128, 0, 128, 128, 1, 2, 3, 128, //
  4585. 0, 1, 128, 128, 2, 3, 4, 128, //
  4586. 128, 128, 0, 128, 1, 2, 3, 128, //
  4587. 0, 128, 1, 128, 2, 3, 4, 128, //
  4588. 128, 0, 1, 128, 2, 3, 4, 128, //
  4589. 0, 1, 2, 128, 3, 4, 5, 128, //
  4590. 128, 128, 128, 0, 1, 2, 3, 128, //
  4591. 0, 128, 128, 1, 2, 3, 4, 128, //
  4592. 128, 0, 128, 1, 2, 3, 4, 128, //
  4593. 0, 1, 128, 2, 3, 4, 5, 128, //
  4594. 128, 128, 0, 1, 2, 3, 4, 128, //
  4595. 0, 128, 1, 2, 3, 4, 5, 128, //
  4596. 128, 0, 1, 2, 3, 4, 5, 128, //
  4597. 0, 1, 2, 3, 4, 5, 6, 128, //
  4598. 128, 128, 128, 128, 128, 128, 128, 0, //
  4599. 0, 128, 128, 128, 128, 128, 128, 1, //
  4600. 128, 0, 128, 128, 128, 128, 128, 1, //
  4601. 0, 1, 128, 128, 128, 128, 128, 2, //
  4602. 128, 128, 0, 128, 128, 128, 128, 1, //
  4603. 0, 128, 1, 128, 128, 128, 128, 2, //
  4604. 128, 0, 1, 128, 128, 128, 128, 2, //
  4605. 0, 1, 2, 128, 128, 128, 128, 3, //
  4606. 128, 128, 128, 0, 128, 128, 128, 1, //
  4607. 0, 128, 128, 1, 128, 128, 128, 2, //
  4608. 128, 0, 128, 1, 128, 128, 128, 2, //
  4609. 0, 1, 128, 2, 128, 128, 128, 3, //
  4610. 128, 128, 0, 1, 128, 128, 128, 2, //
  4611. 0, 128, 1, 2, 128, 128, 128, 3, //
  4612. 128, 0, 1, 2, 128, 128, 128, 3, //
  4613. 0, 1, 2, 3, 128, 128, 128, 4, //
  4614. 128, 128, 128, 128, 0, 128, 128, 1, //
  4615. 0, 128, 128, 128, 1, 128, 128, 2, //
  4616. 128, 0, 128, 128, 1, 128, 128, 2, //
  4617. 0, 1, 128, 128, 2, 128, 128, 3, //
  4618. 128, 128, 0, 128, 1, 128, 128, 2, //
  4619. 0, 128, 1, 128, 2, 128, 128, 3, //
  4620. 128, 0, 1, 128, 2, 128, 128, 3, //
  4621. 0, 1, 2, 128, 3, 128, 128, 4, //
  4622. 128, 128, 128, 0, 1, 128, 128, 2, //
  4623. 0, 128, 128, 1, 2, 128, 128, 3, //
  4624. 128, 0, 128, 1, 2, 128, 128, 3, //
  4625. 0, 1, 128, 2, 3, 128, 128, 4, //
  4626. 128, 128, 0, 1, 2, 128, 128, 3, //
  4627. 0, 128, 1, 2, 3, 128, 128, 4, //
  4628. 128, 0, 1, 2, 3, 128, 128, 4, //
  4629. 0, 1, 2, 3, 4, 128, 128, 5, //
  4630. 128, 128, 128, 128, 128, 0, 128, 1, //
  4631. 0, 128, 128, 128, 128, 1, 128, 2, //
  4632. 128, 0, 128, 128, 128, 1, 128, 2, //
  4633. 0, 1, 128, 128, 128, 2, 128, 3, //
  4634. 128, 128, 0, 128, 128, 1, 128, 2, //
  4635. 0, 128, 1, 128, 128, 2, 128, 3, //
  4636. 128, 0, 1, 128, 128, 2, 128, 3, //
  4637. 0, 1, 2, 128, 128, 3, 128, 4, //
  4638. 128, 128, 128, 0, 128, 1, 128, 2, //
  4639. 0, 128, 128, 1, 128, 2, 128, 3, //
  4640. 128, 0, 128, 1, 128, 2, 128, 3, //
  4641. 0, 1, 128, 2, 128, 3, 128, 4, //
  4642. 128, 128, 0, 1, 128, 2, 128, 3, //
  4643. 0, 128, 1, 2, 128, 3, 128, 4, //
  4644. 128, 0, 1, 2, 128, 3, 128, 4, //
  4645. 0, 1, 2, 3, 128, 4, 128, 5, //
  4646. 128, 128, 128, 128, 0, 1, 128, 2, //
  4647. 0, 128, 128, 128, 1, 2, 128, 3, //
  4648. 128, 0, 128, 128, 1, 2, 128, 3, //
  4649. 0, 1, 128, 128, 2, 3, 128, 4, //
  4650. 128, 128, 0, 128, 1, 2, 128, 3, //
  4651. 0, 128, 1, 128, 2, 3, 128, 4, //
  4652. 128, 0, 1, 128, 2, 3, 128, 4, //
  4653. 0, 1, 2, 128, 3, 4, 128, 5, //
  4654. 128, 128, 128, 0, 1, 2, 128, 3, //
  4655. 0, 128, 128, 1, 2, 3, 128, 4, //
  4656. 128, 0, 128, 1, 2, 3, 128, 4, //
  4657. 0, 1, 128, 2, 3, 4, 128, 5, //
  4658. 128, 128, 0, 1, 2, 3, 128, 4, //
  4659. 0, 128, 1, 2, 3, 4, 128, 5, //
  4660. 128, 0, 1, 2, 3, 4, 128, 5, //
  4661. 0, 1, 2, 3, 4, 5, 128, 6, //
  4662. 128, 128, 128, 128, 128, 128, 0, 1, //
  4663. 0, 128, 128, 128, 128, 128, 1, 2, //
  4664. 128, 0, 128, 128, 128, 128, 1, 2, //
  4665. 0, 1, 128, 128, 128, 128, 2, 3, //
  4666. 128, 128, 0, 128, 128, 128, 1, 2, //
  4667. 0, 128, 1, 128, 128, 128, 2, 3, //
  4668. 128, 0, 1, 128, 128, 128, 2, 3, //
  4669. 0, 1, 2, 128, 128, 128, 3, 4, //
  4670. 128, 128, 128, 0, 128, 128, 1, 2, //
  4671. 0, 128, 128, 1, 128, 128, 2, 3, //
  4672. 128, 0, 128, 1, 128, 128, 2, 3, //
  4673. 0, 1, 128, 2, 128, 128, 3, 4, //
  4674. 128, 128, 0, 1, 128, 128, 2, 3, //
  4675. 0, 128, 1, 2, 128, 128, 3, 4, //
  4676. 128, 0, 1, 2, 128, 128, 3, 4, //
  4677. 0, 1, 2, 3, 128, 128, 4, 5, //
  4678. 128, 128, 128, 128, 0, 128, 1, 2, //
  4679. 0, 128, 128, 128, 1, 128, 2, 3, //
  4680. 128, 0, 128, 128, 1, 128, 2, 3, //
  4681. 0, 1, 128, 128, 2, 128, 3, 4, //
  4682. 128, 128, 0, 128, 1, 128, 2, 3, //
  4683. 0, 128, 1, 128, 2, 128, 3, 4, //
  4684. 128, 0, 1, 128, 2, 128, 3, 4, //
  4685. 0, 1, 2, 128, 3, 128, 4, 5, //
  4686. 128, 128, 128, 0, 1, 128, 2, 3, //
  4687. 0, 128, 128, 1, 2, 128, 3, 4, //
  4688. 128, 0, 128, 1, 2, 128, 3, 4, //
  4689. 0, 1, 128, 2, 3, 128, 4, 5, //
  4690. 128, 128, 0, 1, 2, 128, 3, 4, //
  4691. 0, 128, 1, 2, 3, 128, 4, 5, //
  4692. 128, 0, 1, 2, 3, 128, 4, 5, //
  4693. 0, 1, 2, 3, 4, 128, 5, 6, //
  4694. 128, 128, 128, 128, 128, 0, 1, 2, //
  4695. 0, 128, 128, 128, 128, 1, 2, 3, //
  4696. 128, 0, 128, 128, 128, 1, 2, 3, //
  4697. 0, 1, 128, 128, 128, 2, 3, 4, //
  4698. 128, 128, 0, 128, 128, 1, 2, 3, //
  4699. 0, 128, 1, 128, 128, 2, 3, 4, //
  4700. 128, 0, 1, 128, 128, 2, 3, 4, //
  4701. 0, 1, 2, 128, 128, 3, 4, 5, //
  4702. 128, 128, 128, 0, 128, 1, 2, 3, //
  4703. 0, 128, 128, 1, 128, 2, 3, 4, //
  4704. 128, 0, 128, 1, 128, 2, 3, 4, //
  4705. 0, 1, 128, 2, 128, 3, 4, 5, //
  4706. 128, 128, 0, 1, 128, 2, 3, 4, //
  4707. 0, 128, 1, 2, 128, 3, 4, 5, //
  4708. 128, 0, 1, 2, 128, 3, 4, 5, //
  4709. 0, 1, 2, 3, 128, 4, 5, 6, //
  4710. 128, 128, 128, 128, 0, 1, 2, 3, //
  4711. 0, 128, 128, 128, 1, 2, 3, 4, //
  4712. 128, 0, 128, 128, 1, 2, 3, 4, //
  4713. 0, 1, 128, 128, 2, 3, 4, 5, //
  4714. 128, 128, 0, 128, 1, 2, 3, 4, //
  4715. 0, 128, 1, 128, 2, 3, 4, 5, //
  4716. 128, 0, 1, 128, 2, 3, 4, 5, //
  4717. 0, 1, 2, 128, 3, 4, 5, 6, //
  4718. 128, 128, 128, 0, 1, 2, 3, 4, //
  4719. 0, 128, 128, 1, 2, 3, 4, 5, //
  4720. 128, 0, 128, 1, 2, 3, 4, 5, //
  4721. 0, 1, 128, 2, 3, 4, 5, 6, //
  4722. 128, 128, 0, 1, 2, 3, 4, 5, //
  4723. 0, 128, 1, 2, 3, 4, 5, 6, //
  4724. 128, 0, 1, 2, 3, 4, 5, 6, //
  4725. 0, 1, 2, 3, 4, 5, 6, 7};
  return LoadU(du8, table + mask_bits * 8);
}

}  // namespace detail

// Half vector of bytes: one table lookup
template <typename T, size_t N, HWY_IF_T_SIZE(T, 1), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;

  const uint64_t mask_bits = detail::BitsFromMask(mask);
  const Vec128<uint8_t, N> indices =
      detail::IndicesForExpandFromBits<N>(mask_bits);
  return BitCast(d, TableLookupBytesOr0(v, indices));
}

// Full vector of bytes: two table lookups
template <typename T, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
  const Full128<T> d;
  const RebindToUnsigned<decltype(d)> du;
  const Half<decltype(du)> duh;
  const Vec128<uint8_t> vu = BitCast(du, v);

  const uint64_t mask_bits = detail::BitsFromMask(mask);
  const uint64_t maskL = mask_bits & 0xFF;
  const uint64_t maskH = mask_bits >> 8;

  // We want to skip past the v bytes already consumed by idxL. There is no
  // instruction for shift-reg by variable bytes. Storing v itself would work
  // but would involve a store-load forwarding stall. We instead shuffle using
  // loaded indices. multishift_epi64_epi8 would also help, but if we have
  // that, we probably also have native 8-bit Expand.
  alignas(16) static constexpr uint8_t iota[32] = {
      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,
      11,  12,  13,  14,  15,  128, 128, 128, 128, 128, 128,
      128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
  const VFromD<decltype(du)> shift = LoadU(du, iota + PopCount(maskL));
  const VFromD<decltype(duh)> vL = LowerHalf(duh, vu);
  const VFromD<decltype(duh)> vH =
      LowerHalf(duh, TableLookupBytesOr0(vu, shift));

  const VFromD<decltype(duh)> idxL = detail::IndicesForExpandFromBits<8>(maskL);
  const VFromD<decltype(duh)> idxH = detail::IndicesForExpandFromBits<8>(maskH);

  const VFromD<decltype(duh)> expandL = TableLookupBytesOr0(vL, idxL);
  const VFromD<decltype(duh)> expandH = TableLookupBytesOr0(vH, idxH);
  return BitCast(d, Combine(du, expandH, expandL));
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const Rebind<uint8_t, decltype(d)> du8;
  const uint64_t mask_bits = detail::BitsFromMask(mask);

  // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply
  // the nibble trick used below because not all indices fit within one lane.
  alignas(16) static constexpr uint8_t table[2048] = {
      // PrintExpand16x8ByteTables
  4777. 128, 128, 128, 128, 128, 128, 128, 128, //
  4778. 0, 128, 128, 128, 128, 128, 128, 128, //
  4779. 128, 0, 128, 128, 128, 128, 128, 128, //
  4780. 0, 2, 128, 128, 128, 128, 128, 128, //
  4781. 128, 128, 0, 128, 128, 128, 128, 128, //
  4782. 0, 128, 2, 128, 128, 128, 128, 128, //
  4783. 128, 0, 2, 128, 128, 128, 128, 128, //
  4784. 0, 2, 4, 128, 128, 128, 128, 128, //
  4785. 128, 128, 128, 0, 128, 128, 128, 128, //
  4786. 0, 128, 128, 2, 128, 128, 128, 128, //
  4787. 128, 0, 128, 2, 128, 128, 128, 128, //
  4788. 0, 2, 128, 4, 128, 128, 128, 128, //
  4789. 128, 128, 0, 2, 128, 128, 128, 128, //
  4790. 0, 128, 2, 4, 128, 128, 128, 128, //
  4791. 128, 0, 2, 4, 128, 128, 128, 128, //
  4792. 0, 2, 4, 6, 128, 128, 128, 128, //
  4793. 128, 128, 128, 128, 0, 128, 128, 128, //
  4794. 0, 128, 128, 128, 2, 128, 128, 128, //
  4795. 128, 0, 128, 128, 2, 128, 128, 128, //
  4796. 0, 2, 128, 128, 4, 128, 128, 128, //
  4797. 128, 128, 0, 128, 2, 128, 128, 128, //
  4798. 0, 128, 2, 128, 4, 128, 128, 128, //
  4799. 128, 0, 2, 128, 4, 128, 128, 128, //
  4800. 0, 2, 4, 128, 6, 128, 128, 128, //
  4801. 128, 128, 128, 0, 2, 128, 128, 128, //
  4802. 0, 128, 128, 2, 4, 128, 128, 128, //
  4803. 128, 0, 128, 2, 4, 128, 128, 128, //
  4804. 0, 2, 128, 4, 6, 128, 128, 128, //
  4805. 128, 128, 0, 2, 4, 128, 128, 128, //
  4806. 0, 128, 2, 4, 6, 128, 128, 128, //
  4807. 128, 0, 2, 4, 6, 128, 128, 128, //
  4808. 0, 2, 4, 6, 8, 128, 128, 128, //
  4809. 128, 128, 128, 128, 128, 0, 128, 128, //
  4810. 0, 128, 128, 128, 128, 2, 128, 128, //
  4811. 128, 0, 128, 128, 128, 2, 128, 128, //
  4812. 0, 2, 128, 128, 128, 4, 128, 128, //
  4813. 128, 128, 0, 128, 128, 2, 128, 128, //
  4814. 0, 128, 2, 128, 128, 4, 128, 128, //
  4815. 128, 0, 2, 128, 128, 4, 128, 128, //
  4816. 0, 2, 4, 128, 128, 6, 128, 128, //
  4817. 128, 128, 128, 0, 128, 2, 128, 128, //
  4818. 0, 128, 128, 2, 128, 4, 128, 128, //
  4819. 128, 0, 128, 2, 128, 4, 128, 128, //
  4820. 0, 2, 128, 4, 128, 6, 128, 128, //
  4821. 128, 128, 0, 2, 128, 4, 128, 128, //
  4822. 0, 128, 2, 4, 128, 6, 128, 128, //
  4823. 128, 0, 2, 4, 128, 6, 128, 128, //
  4824. 0, 2, 4, 6, 128, 8, 128, 128, //
  4825. 128, 128, 128, 128, 0, 2, 128, 128, //
  4826. 0, 128, 128, 128, 2, 4, 128, 128, //
  4827. 128, 0, 128, 128, 2, 4, 128, 128, //
  4828. 0, 2, 128, 128, 4, 6, 128, 128, //
  4829. 128, 128, 0, 128, 2, 4, 128, 128, //
  4830. 0, 128, 2, 128, 4, 6, 128, 128, //
  4831. 128, 0, 2, 128, 4, 6, 128, 128, //
  4832. 0, 2, 4, 128, 6, 8, 128, 128, //
  4833. 128, 128, 128, 0, 2, 4, 128, 128, //
  4834. 0, 128, 128, 2, 4, 6, 128, 128, //
  4835. 128, 0, 128, 2, 4, 6, 128, 128, //
  4836. 0, 2, 128, 4, 6, 8, 128, 128, //
  4837. 128, 128, 0, 2, 4, 6, 128, 128, //
  4838. 0, 128, 2, 4, 6, 8, 128, 128, //
  4839. 128, 0, 2, 4, 6, 8, 128, 128, //
  4840. 0, 2, 4, 6, 8, 10, 128, 128, //
  4841. 128, 128, 128, 128, 128, 128, 0, 128, //
  4842. 0, 128, 128, 128, 128, 128, 2, 128, //
  4843. 128, 0, 128, 128, 128, 128, 2, 128, //
  4844. 0, 2, 128, 128, 128, 128, 4, 128, //
  4845. 128, 128, 0, 128, 128, 128, 2, 128, //
  4846. 0, 128, 2, 128, 128, 128, 4, 128, //
  4847. 128, 0, 2, 128, 128, 128, 4, 128, //
  4848. 0, 2, 4, 128, 128, 128, 6, 128, //
  4849. 128, 128, 128, 0, 128, 128, 2, 128, //
  4850. 0, 128, 128, 2, 128, 128, 4, 128, //
  4851. 128, 0, 128, 2, 128, 128, 4, 128, //
  4852. 0, 2, 128, 4, 128, 128, 6, 128, //
  4853. 128, 128, 0, 2, 128, 128, 4, 128, //
  4854. 0, 128, 2, 4, 128, 128, 6, 128, //
  4855. 128, 0, 2, 4, 128, 128, 6, 128, //
  4856. 0, 2, 4, 6, 128, 128, 8, 128, //
  4857. 128, 128, 128, 128, 0, 128, 2, 128, //
  4858. 0, 128, 128, 128, 2, 128, 4, 128, //
  4859. 128, 0, 128, 128, 2, 128, 4, 128, //
  4860. 0, 2, 128, 128, 4, 128, 6, 128, //
  4861. 128, 128, 0, 128, 2, 128, 4, 128, //
  4862. 0, 128, 2, 128, 4, 128, 6, 128, //
  4863. 128, 0, 2, 128, 4, 128, 6, 128, //
  4864. 0, 2, 4, 128, 6, 128, 8, 128, //
  4865. 128, 128, 128, 0, 2, 128, 4, 128, //
  4866. 0, 128, 128, 2, 4, 128, 6, 128, //
  4867. 128, 0, 128, 2, 4, 128, 6, 128, //
  4868. 0, 2, 128, 4, 6, 128, 8, 128, //
  4869. 128, 128, 0, 2, 4, 128, 6, 128, //
  4870. 0, 128, 2, 4, 6, 128, 8, 128, //
  4871. 128, 0, 2, 4, 6, 128, 8, 128, //
  4872. 0, 2, 4, 6, 8, 128, 10, 128, //
  4873. 128, 128, 128, 128, 128, 0, 2, 128, //
  4874. 0, 128, 128, 128, 128, 2, 4, 128, //
  4875. 128, 0, 128, 128, 128, 2, 4, 128, //
  4876. 0, 2, 128, 128, 128, 4, 6, 128, //
  4877. 128, 128, 0, 128, 128, 2, 4, 128, //
  4878. 0, 128, 2, 128, 128, 4, 6, 128, //
  4879. 128, 0, 2, 128, 128, 4, 6, 128, //
  4880. 0, 2, 4, 128, 128, 6, 8, 128, //
  4881. 128, 128, 128, 0, 128, 2, 4, 128, //
  4882. 0, 128, 128, 2, 128, 4, 6, 128, //
  4883. 128, 0, 128, 2, 128, 4, 6, 128, //
  4884. 0, 2, 128, 4, 128, 6, 8, 128, //
  4885. 128, 128, 0, 2, 128, 4, 6, 128, //
  4886. 0, 128, 2, 4, 128, 6, 8, 128, //
  4887. 128, 0, 2, 4, 128, 6, 8, 128, //
  4888. 0, 2, 4, 6, 128, 8, 10, 128, //
  4889. 128, 128, 128, 128, 0, 2, 4, 128, //
  4890. 0, 128, 128, 128, 2, 4, 6, 128, //
  4891. 128, 0, 128, 128, 2, 4, 6, 128, //
  4892. 0, 2, 128, 128, 4, 6, 8, 128, //
  4893. 128, 128, 0, 128, 2, 4, 6, 128, //
  4894. 0, 128, 2, 128, 4, 6, 8, 128, //
  4895. 128, 0, 2, 128, 4, 6, 8, 128, //
  4896. 0, 2, 4, 128, 6, 8, 10, 128, //
  4897. 128, 128, 128, 0, 2, 4, 6, 128, //
  4898. 0, 128, 128, 2, 4, 6, 8, 128, //
  4899. 128, 0, 128, 2, 4, 6, 8, 128, //
  4900. 0, 2, 128, 4, 6, 8, 10, 128, //
  4901. 128, 128, 0, 2, 4, 6, 8, 128, //
  4902. 0, 128, 2, 4, 6, 8, 10, 128, //
  4903. 128, 0, 2, 4, 6, 8, 10, 128, //
  4904. 0, 2, 4, 6, 8, 10, 12, 128, //
  4905. 128, 128, 128, 128, 128, 128, 128, 0, //
  4906. 0, 128, 128, 128, 128, 128, 128, 2, //
  4907. 128, 0, 128, 128, 128, 128, 128, 2, //
  4908. 0, 2, 128, 128, 128, 128, 128, 4, //
  4909. 128, 128, 0, 128, 128, 128, 128, 2, //
  4910. 0, 128, 2, 128, 128, 128, 128, 4, //
  4911. 128, 0, 2, 128, 128, 128, 128, 4, //
  4912. 0, 2, 4, 128, 128, 128, 128, 6, //
  4913. 128, 128, 128, 0, 128, 128, 128, 2, //
  4914. 0, 128, 128, 2, 128, 128, 128, 4, //
  4915. 128, 0, 128, 2, 128, 128, 128, 4, //
  4916. 0, 2, 128, 4, 128, 128, 128, 6, //
  4917. 128, 128, 0, 2, 128, 128, 128, 4, //
  4918. 0, 128, 2, 4, 128, 128, 128, 6, //
  4919. 128, 0, 2, 4, 128, 128, 128, 6, //
  4920. 0, 2, 4, 6, 128, 128, 128, 8, //
  4921. 128, 128, 128, 128, 0, 128, 128, 2, //
  4922. 0, 128, 128, 128, 2, 128, 128, 4, //
  4923. 128, 0, 128, 128, 2, 128, 128, 4, //
  4924. 0, 2, 128, 128, 4, 128, 128, 6, //
  4925. 128, 128, 0, 128, 2, 128, 128, 4, //
  4926. 0, 128, 2, 128, 4, 128, 128, 6, //
  4927. 128, 0, 2, 128, 4, 128, 128, 6, //
  4928. 0, 2, 4, 128, 6, 128, 128, 8, //
  4929. 128, 128, 128, 0, 2, 128, 128, 4, //
  4930. 0, 128, 128, 2, 4, 128, 128, 6, //
  4931. 128, 0, 128, 2, 4, 128, 128, 6, //
  4932. 0, 2, 128, 4, 6, 128, 128, 8, //
  4933. 128, 128, 0, 2, 4, 128, 128, 6, //
  4934. 0, 128, 2, 4, 6, 128, 128, 8, //
  4935. 128, 0, 2, 4, 6, 128, 128, 8, //
  4936. 0, 2, 4, 6, 8, 128, 128, 10, //
  4937. 128, 128, 128, 128, 128, 0, 128, 2, //
  4938. 0, 128, 128, 128, 128, 2, 128, 4, //
  4939. 128, 0, 128, 128, 128, 2, 128, 4, //
  4940. 0, 2, 128, 128, 128, 4, 128, 6, //
  4941. 128, 128, 0, 128, 128, 2, 128, 4, //
  4942. 0, 128, 2, 128, 128, 4, 128, 6, //
  4943. 128, 0, 2, 128, 128, 4, 128, 6, //
  4944. 0, 2, 4, 128, 128, 6, 128, 8, //
  4945. 128, 128, 128, 0, 128, 2, 128, 4, //
  4946. 0, 128, 128, 2, 128, 4, 128, 6, //
  4947. 128, 0, 128, 2, 128, 4, 128, 6, //
  4948. 0, 2, 128, 4, 128, 6, 128, 8, //
  4949. 128, 128, 0, 2, 128, 4, 128, 6, //
  4950. 0, 128, 2, 4, 128, 6, 128, 8, //
  4951. 128, 0, 2, 4, 128, 6, 128, 8, //
  4952. 0, 2, 4, 6, 128, 8, 128, 10, //
  4953. 128, 128, 128, 128, 0, 2, 128, 4, //
  4954. 0, 128, 128, 128, 2, 4, 128, 6, //
  4955. 128, 0, 128, 128, 2, 4, 128, 6, //
  4956. 0, 2, 128, 128, 4, 6, 128, 8, //
  4957. 128, 128, 0, 128, 2, 4, 128, 6, //
  4958. 0, 128, 2, 128, 4, 6, 128, 8, //
  4959. 128, 0, 2, 128, 4, 6, 128, 8, //
  4960. 0, 2, 4, 128, 6, 8, 128, 10, //
  4961. 128, 128, 128, 0, 2, 4, 128, 6, //
  4962. 0, 128, 128, 2, 4, 6, 128, 8, //
  4963. 128, 0, 128, 2, 4, 6, 128, 8, //
  4964. 0, 2, 128, 4, 6, 8, 128, 10, //
  4965. 128, 128, 0, 2, 4, 6, 128, 8, //
  4966. 0, 128, 2, 4, 6, 8, 128, 10, //
  4967. 128, 0, 2, 4, 6, 8, 128, 10, //
  4968. 0, 2, 4, 6, 8, 10, 128, 12, //
  4969. 128, 128, 128, 128, 128, 128, 0, 2, //
  4970. 0, 128, 128, 128, 128, 128, 2, 4, //
  4971. 128, 0, 128, 128, 128, 128, 2, 4, //
  4972. 0, 2, 128, 128, 128, 128, 4, 6, //
  4973. 128, 128, 0, 128, 128, 128, 2, 4, //
  4974. 0, 128, 2, 128, 128, 128, 4, 6, //
  4975. 128, 0, 2, 128, 128, 128, 4, 6, //
  4976. 0, 2, 4, 128, 128, 128, 6, 8, //
  4977. 128, 128, 128, 0, 128, 128, 2, 4, //
  4978. 0, 128, 128, 2, 128, 128, 4, 6, //
  4979. 128, 0, 128, 2, 128, 128, 4, 6, //
  4980. 0, 2, 128, 4, 128, 128, 6, 8, //
  4981. 128, 128, 0, 2, 128, 128, 4, 6, //
  4982. 0, 128, 2, 4, 128, 128, 6, 8, //
  4983. 128, 0, 2, 4, 128, 128, 6, 8, //
  4984. 0, 2, 4, 6, 128, 128, 8, 10, //
  4985. 128, 128, 128, 128, 0, 128, 2, 4, //
  4986. 0, 128, 128, 128, 2, 128, 4, 6, //
  4987. 128, 0, 128, 128, 2, 128, 4, 6, //
  4988. 0, 2, 128, 128, 4, 128, 6, 8, //
  4989. 128, 128, 0, 128, 2, 128, 4, 6, //
  4990. 0, 128, 2, 128, 4, 128, 6, 8, //
  4991. 128, 0, 2, 128, 4, 128, 6, 8, //
  4992. 0, 2, 4, 128, 6, 128, 8, 10, //
  4993. 128, 128, 128, 0, 2, 128, 4, 6, //
  4994. 0, 128, 128, 2, 4, 128, 6, 8, //
  4995. 128, 0, 128, 2, 4, 128, 6, 8, //
  4996. 0, 2, 128, 4, 6, 128, 8, 10, //
  4997. 128, 128, 0, 2, 4, 128, 6, 8, //
  4998. 0, 128, 2, 4, 6, 128, 8, 10, //
  4999. 128, 0, 2, 4, 6, 128, 8, 10, //
  5000. 0, 2, 4, 6, 8, 128, 10, 12, //
  5001. 128, 128, 128, 128, 128, 0, 2, 4, //
  5002. 0, 128, 128, 128, 128, 2, 4, 6, //
  5003. 128, 0, 128, 128, 128, 2, 4, 6, //
  5004. 0, 2, 128, 128, 128, 4, 6, 8, //
  5005. 128, 128, 0, 128, 128, 2, 4, 6, //
  5006. 0, 128, 2, 128, 128, 4, 6, 8, //
  5007. 128, 0, 2, 128, 128, 4, 6, 8, //
  5008. 0, 2, 4, 128, 128, 6, 8, 10, //
  5009. 128, 128, 128, 0, 128, 2, 4, 6, //
  5010. 0, 128, 128, 2, 128, 4, 6, 8, //
  5011. 128, 0, 128, 2, 128, 4, 6, 8, //
  5012. 0, 2, 128, 4, 128, 6, 8, 10, //
  5013. 128, 128, 0, 2, 128, 4, 6, 8, //
  5014. 0, 128, 2, 4, 128, 6, 8, 10, //
  5015. 128, 0, 2, 4, 128, 6, 8, 10, //
  5016. 0, 2, 4, 6, 128, 8, 10, 12, //
  5017. 128, 128, 128, 128, 0, 2, 4, 6, //
  5018. 0, 128, 128, 128, 2, 4, 6, 8, //
  5019. 128, 0, 128, 128, 2, 4, 6, 8, //
  5020. 0, 2, 128, 128, 4, 6, 8, 10, //
  5021. 128, 128, 0, 128, 2, 4, 6, 8, //
  5022. 0, 128, 2, 128, 4, 6, 8, 10, //
  5023. 128, 0, 2, 128, 4, 6, 8, 10, //
  5024. 0, 2, 4, 128, 6, 8, 10, 12, //
  5025. 128, 128, 128, 0, 2, 4, 6, 8, //
  5026. 0, 128, 128, 2, 4, 6, 8, 10, //
  5027. 128, 0, 128, 2, 4, 6, 8, 10, //
  5028. 0, 2, 128, 4, 6, 8, 10, 12, //
  5029. 128, 128, 0, 2, 4, 6, 8, 10, //
  5030. 0, 128, 2, 4, 6, 8, 10, 12, //
  5031. 128, 0, 2, 4, 6, 8, 10, 12, //
  5032. 0, 2, 4, 6, 8, 10, 12, 14};
  // Extend to double length because InterleaveLower will only use the (valid)
  // lower half, and we want N u16.
  const Twice<decltype(du8)> du8x2;
  const Vec128<uint8_t, 2 * N> indices8 =
      ZeroExtendVector(du8x2, Load(du8, table + mask_bits * 8));
  const Vec128<uint16_t, N> indices16 =
      BitCast(du, InterleaveLower(du8x2, indices8, indices8));
  // TableLookupBytesOr0 operates on bytes. To convert u16 lane indices to byte
  // indices, add 0 to even and 1 to odd byte lanes.
  const Vec128<uint16_t, N> byte_indices = Add(
      indices16,
      Set(du, static_cast<uint16_t>(HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001)));
  return BitCast(d, TableLookupBytesOr0(v, byte_indices));
}

template <typename T, size_t N, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T, N> Expand(Vec128<T, N> v, Mask128<T, N> mask) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const uint64_t mask_bits = detail::BitsFromMask(mask);

  alignas(16) static constexpr uint32_t packed_array[16] = {
      // PrintExpand64x4Nibble - same for 32x4.
      0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0,
      0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10,
      0x000010ff, 0x000021f0, 0x0000210f, 0x00003210};

  // For lane i, shift the i-th 4-bit index down to bits [0, 2).
  const Vec128<uint32_t, N> packed = Set(du, packed_array[mask_bits]);
  alignas(16) static constexpr uint32_t shifts[4] = {0, 4, 8, 12};
  Vec128<uint32_t, N> indices = packed >> Load(du, shifts);
  // AVX2 _mm256_permutexvar_epi32 will ignore upper bits, but IndicesFromVec
  // checks bounds, so clear the upper bits.
  indices = And(indices, Set(du, N - 1));
  const Vec128<uint32_t, N> expand =
      TableLookupLanes(BitCast(du, v), IndicesFromVec(du, indices));
  // TableLookupLanes cannot also zero masked-off lanes, so do that now.
  return IfThenElseZero(mask, BitCast(d, expand));
}

template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> Expand(Vec128<T> v, Mask128<T> mask) {
  // Same as Compress, just zero out the mask=false lanes.
  return IfThenElseZero(mask, Compress(v, mask));
}

// For single-element vectors, this is at least as fast as native.
template <typename T>
HWY_API Vec128<T, 1> Expand(Vec128<T, 1> v, Mask128<T, 1> mask) {
  return IfThenElseZero(mask, v);
}

// ------------------------------ LoadExpand

template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
                             const TFromD<D>* HWY_RESTRICT unaligned) {
  return Expand(LoadU(d, unaligned), mask);
}

#endif  // HWY_NATIVE_EXPAND
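
// Usage sketch (illustrative only; `packed` is a hypothetical pointer to at
// least CountTrue(d, m) valid elements): Expand scatters consecutive input
// lanes into the lanes selected by the mask and zeroes the rest, the inverse
// of Compress; LoadExpand does the same starting directly from memory.
//   const ScalableTag<uint16_t> d;
//   const auto m = FirstN(d, 3);  // or any mask
//   const auto expanded = LoadExpand(m, d, packed);
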
// ------------------------------ TwoTablesLookupLanes

template <class D>
using IndicesFromD = decltype(IndicesFromVec(D(), Zero(RebindToUnsigned<D>())));

// RVV/SVE have their own implementations of
// TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b, IndicesFromD<D> idx)
#if HWY_TARGET != HWY_RVV && !HWY_TARGET_IS_SVE
template <class D>
HWY_API VFromD<D> TwoTablesLookupLanes(D /*d*/, VFromD<D> a, VFromD<D> b,
                                       IndicesFromD<D> idx) {
  return TwoTablesLookupLanes(a, b, idx);
}
#endif

// ------------------------------ Reverse2, Reverse4, Reverse8 (8-bit)

#if (defined(HWY_NATIVE_REVERSE2_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE
#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif

#undef HWY_PREFER_ROTATE
// Platforms on which RotateRight is likely faster than TableLookupBytes.
// RVV and SVE anyway have their own implementation of this.
#if HWY_TARGET == HWY_SSE2 || HWY_TARGET <= HWY_AVX3 || \
    HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_PPC8
#define HWY_PREFER_ROTATE 1
#else
#define HWY_PREFER_ROTATE 0
#endif

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  // Exclude AVX3 because its 16-bit RotateRight is actually 3 instructions.
#if HWY_PREFER_ROTATE && HWY_TARGET > HWY_AVX3
  const Repartition<uint16_t, decltype(d)> du16;
  return BitCast(d, RotateRight<8>(BitCast(du16, v)));
#else
  const VFromD<D> shuffle = Dup128VecFromValues(
      d, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
  return TableLookupBytes(v, shuffle);
#endif
}

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
#if HWY_PREFER_ROTATE
  const Repartition<uint16_t, decltype(d)> du16;
  return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v))));
#else
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
      du8, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
  return TableLookupBytes(v, BitCast(d, shuffle));
#endif
}

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
#if HWY_PREFER_ROTATE
  const Repartition<uint32_t, D> du32;
  return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v))));
#else
  const Repartition<uint8_t, decltype(d)> du8;
  const VFromD<decltype(du8)> shuffle = Dup128VecFromValues(
      du8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
  return TableLookupBytes(v, BitCast(d, shuffle));
#endif
}

#endif  // HWY_NATIVE_REVERSE2_8
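
// Why the rotate path above works (sketch): RotateRight<8> on each u16 lane
// exchanges its two bytes, which is exactly Reverse2 of the corresponding
// pair of 8-bit lanes; Reverse4 and Reverse8 then compose Reverse2 at wider
// granularities instead of paying for a byte shuffle.
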
// ------------------------------ ReverseLaneBytes

#if (defined(HWY_NATIVE_REVERSE_LANE_BYTES) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REVERSE_LANE_BYTES
#undef HWY_NATIVE_REVERSE_LANE_BYTES
#else
#define HWY_NATIVE_REVERSE_LANE_BYTES
#endif

template <class V, HWY_IF_T_SIZE_V(V, 2)>
HWY_API V ReverseLaneBytes(V v) {
  const DFromV<V> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, Reverse2(du8, BitCast(du8, v)));
}

template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_API V ReverseLaneBytes(V v) {
  const DFromV<V> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, Reverse4(du8, BitCast(du8, v)));
}

template <class V, HWY_IF_T_SIZE_V(V, 8)>
HWY_API V ReverseLaneBytes(V v) {
  const DFromV<V> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, Reverse8(du8, BitCast(du8, v)));
}

#endif  // HWY_NATIVE_REVERSE_LANE_BYTES
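
// Example (sketch; `in`, `out` and `n` are hypothetical, with n a multiple of
// Lanes(d)): byte-swapping big-endian uint32_t data to native order.
//   const ScalableTag<uint32_t> d;
//   for (size_t i = 0; i < n; i += Lanes(d)) {
//     StoreU(ReverseLaneBytes(LoadU(d, in + i)), d, out + i);
//   }
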
// ------------------------------ ReverseBits

// On these targets, we emulate 8-bit shifts using 16-bit shifts and therefore
// require at least two lanes to BitCast to 16-bit. We avoid Highway's 8-bit
// shifts because those would add extra masking already taken care of by
// UI8ReverseBitsStep. Note that AVX3_DL/AVX3_ZEN4 support GFNI and use it to
// implement ReverseBits, so this code is not used there.
#undef HWY_REVERSE_BITS_MIN_BYTES
#if ((HWY_TARGET >= HWY_AVX3 && HWY_TARGET <= HWY_SSE2) || \
     HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256)
#define HWY_REVERSE_BITS_MIN_BYTES 2
#else
#define HWY_REVERSE_BITS_MIN_BYTES 1
#endif

#if (defined(HWY_NATIVE_REVERSE_BITS_UI8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REVERSE_BITS_UI8
#undef HWY_NATIVE_REVERSE_BITS_UI8
#else
#define HWY_NATIVE_REVERSE_BITS_UI8
#endif

namespace detail {

template <int kShiftAmt, int kShrResultMask, class V,
          HWY_IF_V_SIZE_GT_D(DFromV<V>, HWY_REVERSE_BITS_MIN_BYTES - 1)>
HWY_INLINE V UI8ReverseBitsStep(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
#if HWY_REVERSE_BITS_MIN_BYTES == 2
  const Repartition<uint16_t, decltype(d)> d_shift;
#else
  const RebindToUnsigned<decltype(d)> d_shift;
#endif

  const auto v_to_shift = BitCast(d_shift, v);
  const auto shl_result = BitCast(d, ShiftLeft<kShiftAmt>(v_to_shift));
  const auto shr_result = BitCast(d, ShiftRight<kShiftAmt>(v_to_shift));
  const auto shr_result_mask =
      BitCast(d, Set(du, static_cast<uint8_t>(kShrResultMask)));
  return Or(And(shr_result, shr_result_mask),
            AndNot(shr_result_mask, shl_result));
}

#if HWY_REVERSE_BITS_MIN_BYTES == 2
template <int kShiftAmt, int kShrResultMask, class V,
          HWY_IF_V_SIZE_D(DFromV<V>, 1)>
HWY_INLINE V UI8ReverseBitsStep(V v) {
  return V{UI8ReverseBitsStep<kShiftAmt, kShrResultMask>(Vec128<uint8_t>{v.raw})
               .raw};
}
#endif

}  // namespace detail

template <class V, HWY_IF_T_SIZE_V(V, 1)>
HWY_API V ReverseBits(V v) {
  auto result = detail::UI8ReverseBitsStep<1, 0x55>(v);
  result = detail::UI8ReverseBitsStep<2, 0x33>(result);
  result = detail::UI8ReverseBitsStep<4, 0x0F>(result);
  return result;
}
#endif  // HWY_NATIVE_REVERSE_BITS_UI8
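
// Scalar sketch of the three steps above for one byte b:
//   b = ((b >> 1) & 0x55) | ((b << 1) & 0xAA);  // swap adjacent bits
//   b = ((b >> 2) & 0x33) | ((b << 2) & 0xCC);  // swap 2-bit pairs
//   b = ((b >> 4) & 0x0F) | ((b << 4) & 0xF0);  // swap nibbles
// Each UI8ReverseBitsStep performs one of these, with the AndNot supplying
// the complementary mask for the shifted-left half.
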
#if (defined(HWY_NATIVE_REVERSE_BITS_UI16_32_64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64
#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64
#else
#define HWY_NATIVE_REVERSE_BITS_UI16_32_64
#endif

template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8)),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V ReverseBits(V v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v))));
}
#endif  // HWY_NATIVE_REVERSE_BITS_UI16_32_64

// ------------------------------ Per4LaneBlockShuffle

#if (defined(HWY_NATIVE_PER4LANEBLKSHUF_DUP32) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#else
#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#endif

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
namespace detail {

template <class D>
HWY_INLINE Vec<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
                                             const uint32_t x2,
                                             const uint32_t x1,
                                             const uint32_t x0) {
#if HWY_TARGET == HWY_RVV
  constexpr int kPow2 = d.Pow2();
  constexpr int kLoadPow2 = HWY_MAX(kPow2, -1);
  const ScalableTag<uint32_t, kLoadPow2> d_load;
#else
  constexpr size_t kMaxBytes = d.MaxBytes();
#if HWY_TARGET_IS_NEON
  constexpr size_t kMinLanesToLoad = 2;
#else
  constexpr size_t kMinLanesToLoad = 4;
#endif
  constexpr size_t kNumToLoad =
      HWY_MAX(kMaxBytes / sizeof(uint32_t), kMinLanesToLoad);
  const CappedTag<uint32_t, kNumToLoad> d_load;
#endif

  return ResizeBitCast(d, Dup128VecFromValues(d_load, x0, x1, x2, x3));
}

}  // namespace detail
#endif
#endif  // HWY_NATIVE_PER4LANEBLKSHUF_DUP32
  5280. #if HWY_TARGET != HWY_SCALAR || HWY_IDE
  5281. namespace detail {
  5282. template <class V>
  5283. HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<0> /*idx_10_tag*/, V v) {
  5284. return DupEven(v);
  5285. }
  5286. template <class V>
  5287. HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<1> /*idx_10_tag*/, V v) {
  5288. const DFromV<decltype(v)> d;
  5289. return Reverse2(d, v);
  5290. }
  5291. template <class V>
  5292. HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<2> /*idx_10_tag*/, V v) {
  5293. return v;
  5294. }
  5295. template <class V>
  5296. HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<3> /*idx_10_tag*/, V v) {
  5297. return DupOdd(v);
  5298. }
  5299. HWY_INLINE uint32_t U8x4Per4LaneBlkIndices(const uint32_t idx3,
  5300. const uint32_t idx2,
  5301. const uint32_t idx1,
  5302. const uint32_t idx0) {
  5303. #if HWY_IS_LITTLE_ENDIAN
  5304. return static_cast<uint32_t>((idx3 << 24) | (idx2 << 16) | (idx1 << 8) |
  5305. idx0);
  5306. #else
  5307. return static_cast<uint32_t>(idx3 | (idx2 << 8) | (idx1 << 16) |
  5308. (idx0 << 24));
  5309. #endif
  5310. }
  5311. template <class D>
  5312. HWY_INLINE Vec<D> TblLookupPer4LaneBlkU8IdxInBlk(D d, const uint32_t idx3,
  5313. const uint32_t idx2,
  5314. const uint32_t idx1,
  5315. const uint32_t idx0) {
  5316. #if HWY_TARGET == HWY_RVV
  5317. const AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32;
  5318. #else
  5319. const Repartition<uint32_t, D> du32;
  5320. #endif
  5321. return ResizeBitCast(
  5322. d, Set(du32, U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0)));
  5323. }
  5324. #if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE || HWY_TARGET == HWY_EMU128
  5325. #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) void* = nullptr
  5326. #else
  5327. #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) HWY_IF_T_SIZE_D(D, 8)
  5328. template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))>
  5329. HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, V idx) {
  5330. const DFromV<decltype(v)> d;
  5331. const Repartition<uint8_t, decltype(d)> du8;
  5332. return BitCast(d, TableLookupBytes(BitCast(du8, v), BitCast(du8, idx)));
  5333. }
  5334. template <class D, HWY_IF_T_SIZE_D(D, 1)>
  5335. HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
  5336. const uint32_t idx2,
  5337. const uint32_t idx1,
  5338. const uint32_t idx0) {
  5339. const Repartition<uint32_t, decltype(d)> du32;
  5340. const uint32_t idx3210 = U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0);
  5341. const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
  5342. du32, static_cast<uint32_t>(idx3210 + 0x0C0C0C0C),
  5343. static_cast<uint32_t>(idx3210 + 0x08080808),
  5344. static_cast<uint32_t>(idx3210 + 0x04040404),
  5345. static_cast<uint32_t>(idx3210));
  5346. return ResizeBitCast(d, v_byte_idx);
  5347. }

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
                                              const uint32_t idx2,
                                              const uint32_t idx1,
                                              const uint32_t idx0) {
  const Repartition<uint32_t, decltype(d)> du32;
#if HWY_IS_LITTLE_ENDIAN
  const uint32_t idx10 = static_cast<uint32_t>((idx1 << 16) | idx0);
  const uint32_t idx32 = static_cast<uint32_t>((idx3 << 16) | idx2);
  constexpr uint32_t kLaneByteOffsets{0x01000100};
#else
  const uint32_t idx10 = static_cast<uint32_t>(idx1 | (idx0 << 16));
  const uint32_t idx32 = static_cast<uint32_t>(idx3 | (idx2 << 16));
  constexpr uint32_t kLaneByteOffsets{0x00010001};
#endif
  constexpr uint32_t kHiLaneByteOffsets{kLaneByteOffsets + 0x08080808u};

  const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
      du32, static_cast<uint32_t>(idx32 * 0x0202u + kHiLaneByteOffsets),
      static_cast<uint32_t>(idx10 * 0x0202u + kHiLaneByteOffsets),
      static_cast<uint32_t>(idx32 * 0x0202u + kLaneByteOffsets),
      static_cast<uint32_t>(idx10 * 0x0202u + kLaneByteOffsets));
  return ResizeBitCast(d, v_byte_idx);
}
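
// Illustrative example (not from the original source): within each 16-bit half
// of idx10/idx32, a lane index idx becomes idx * 0x0202 + 0x0100 on
// little-endian targets, e.g. idx == 3 -> 0x0706, i.e. byte indices {6, 7} of
// that u16 lane; kHiLaneByteOffsets adds 8 to every byte to address the upper
// half of the 16-byte block.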

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE Vec<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
                                              const uint32_t idx2,
                                              const uint32_t idx1,
                                              const uint32_t idx0) {
  const Repartition<uint32_t, decltype(d)> du32;
#if HWY_IS_LITTLE_ENDIAN
  constexpr uint32_t kLaneByteOffsets{0x03020100};
#else
  constexpr uint32_t kLaneByteOffsets{0x00010203};
#endif

  const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32(
      du32, static_cast<uint32_t>(idx3 * 0x04040404u + kLaneByteOffsets),
      static_cast<uint32_t>(idx2 * 0x04040404u + kLaneByteOffsets),
      static_cast<uint32_t>(idx1 * 0x04040404u + kLaneByteOffsets),
      static_cast<uint32_t>(idx0 * 0x04040404u + kLaneByteOffsets));
  return ResizeBitCast(d, v_byte_idx);
}
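
// Illustrative example (not from the original source): on a little-endian
// target, idx == 2 yields 2 * 0x04040404 + 0x03020100 == 0x0B0A0908, i.e. byte
// indices {8, 9, 10, 11}, which is exactly u32 lane 2 of the 16-byte block.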

#endif

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) {
  return TblLookupPer4LaneBlkU8IdxInBlk(d, idx3, idx2, idx1, idx0);
}

#if HWY_TARGET == HWY_RVV
template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) {
  const Rebind<uint8_t, decltype(d)> du8;
  return PromoteTo(d,
                   TblLookupPer4LaneBlkU8IdxInBlk(du8, idx3, idx2, idx1, idx0));
}
#else
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) {
  const uint16_t u16_idx0 = static_cast<uint16_t>(idx0);
  const uint16_t u16_idx1 = static_cast<uint16_t>(idx1);
  const uint16_t u16_idx2 = static_cast<uint16_t>(idx2);
  const uint16_t u16_idx3 = static_cast<uint16_t>(idx3);
#if HWY_TARGET_IS_NEON
  constexpr size_t kMinLanesToLoad = 4;
#else
  constexpr size_t kMinLanesToLoad = 8;
#endif
  constexpr size_t kNumToLoad = HWY_MAX(HWY_MAX_LANES_D(D), kMinLanesToLoad);
  const CappedTag<uint16_t, kNumToLoad> d_load;

  return ResizeBitCast(
      d, Dup128VecFromValues(d_load, u16_idx0, u16_idx1, u16_idx2, u16_idx3,
                             u16_idx0, u16_idx1, u16_idx2, u16_idx3));
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) {
  return Per4LaneBlkShufDupSet4xU32(d, idx3, idx2, idx1, idx0);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<D> TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3,
                                                  const uint32_t idx2,
                                                  const uint32_t idx1,
                                                  const uint32_t idx0) {
  const RebindToUnsigned<decltype(d)> du;
  const Rebind<uint32_t, decltype(d)> du32;
  return BitCast(d, PromoteTo(du, Per4LaneBlkShufDupSet4xU32(du32, idx3, idx2,
                                                             idx1, idx0)));
}
#endif

template <class D, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D)>
HWY_INLINE IndicesFromD<D> TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3,
                                                       const uint32_t idx2,
                                                       const uint32_t idx1,
                                                       const uint32_t idx0) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  auto idx_in_blk = TblLookupPer4LaneBlkIdxInBlk(du, idx3, idx2, idx1, idx0);

  constexpr size_t kN = HWY_MAX_LANES_D(D);
  if (kN < 4) {
    idx_in_blk = And(idx_in_blk, Set(du, static_cast<TU>(kN - 1)));
  }

#if HWY_TARGET == HWY_RVV
  const auto blk_offsets = AndS(Iota0(du), static_cast<TU>(~TU{3}));
#else
  const auto blk_offsets =
      And(Iota(du, TU{0}), Set(du, static_cast<TU>(~TU{3})));
#endif
  return IndicesFromVec(d, Add(idx_in_blk, blk_offsets));
}
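
// Illustrative example (not from the original source): for an 8-lane u32
// vector with pattern (idx3, idx2, idx1, idx0) == (3, 1, 2, 0), idx_in_blk is
// {0, 2, 1, 3, 0, 2, 1, 3} and blk_offsets (Iota & ~3) is
// {0, 0, 0, 0, 4, 4, 4, 4}, giving the final lane indices
// {0, 2, 1, 3, 4, 6, 5, 7}.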

template <class V, HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(DFromV<V>)>
HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, IndicesFromD<DFromV<V>> idx) {
  return TableLookupLanes(v, idx);
}

#undef HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE

template <class V>
HWY_INLINE V TblLookupPer4LaneBlkShuf(V v, size_t idx3210) {
  const DFromV<decltype(v)> d;
  const uint32_t idx3 = static_cast<uint32_t>((idx3210 >> 6) & 3);
  const uint32_t idx2 = static_cast<uint32_t>((idx3210 >> 4) & 3);
  const uint32_t idx1 = static_cast<uint32_t>((idx3210 >> 2) & 3);
  const uint32_t idx0 = static_cast<uint32_t>(idx3210 & 3);
  const auto idx = TblLookupPer4LaneBlkShufIdx(d, idx3, idx2, idx1, idx0);
  return Per4LaneBlkShufDoTblLookup(v, idx);
}

// The detail::Per4LaneBlockShuffle overloads that have the extra lane_size_tag
// and vect_size_tag parameters are only called for vectors that have at
// least 4 lanes (or scalable vectors that might possibly have 4 or more lanes)
template <size_t kIdx3210, size_t kLaneSize, size_t kVectSize, class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> /*idx_3210_tag*/,
                                  hwy::SizeTag<kLaneSize> /*lane_size_tag*/,
                                  hwy::SizeTag<kVectSize> /*vect_size_tag*/,
                                  V v) {
  return TblLookupPer4LaneBlkShuf(v, kIdx3210);
}

#if HWY_HAVE_FLOAT64
template <class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> Per4LaneBlockShufCastToWide(
    hwy::FloatTag /* type_tag */, hwy::SizeTag<4> /* lane_size_tag */, V v) {
  const DFromV<decltype(v)> d;
  const RepartitionToWide<decltype(d)> dw;
  return BitCast(dw, v);
}
#endif

template <size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWide<RebindToUnsigned<DFromV<V>>>>
Per4LaneBlockShufCastToWide(hwy::FloatTag /* type_tag */,
                            hwy::SizeTag<kLaneSize> /* lane_size_tag */, V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RepartitionToWide<decltype(du)> dw;
  return BitCast(dw, v);
}

template <size_t kLaneSize, class V>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> Per4LaneBlockShufCastToWide(
    hwy::NonFloatTag /* type_tag */,
    hwy::SizeTag<kLaneSize> /* lane_size_tag */, V v) {
  const DFromV<decltype(v)> d;
  const RepartitionToWide<decltype(d)> dw;
  return BitCast(dw, v);
}

template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x1B> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return Reverse4(d, v);
}

template <class V,
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
                                        (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const auto vw = Per4LaneBlockShufCastToWide(
      hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
  return BitCast(d, DupEven(vw));
}

template <class V,
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
                                        (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const auto vw = Per4LaneBlockShufCastToWide(
      hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
  const DFromV<decltype(vw)> dw;
  return BitCast(d, Reverse2(dw, vw));
}

#if HWY_MAX_BYTES >= 32
template <class V, HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) {
  return SwapAdjacentBlocks(v);
}
#endif

template <class V, HWY_IF_LANES_D(DFromV<V>, 4),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return InterleaveLower(d, v, v);
}

template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return InterleaveLower(d, v, v);
}

template <class V, HWY_IF_LANES_D(DFromV<V>, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return ConcatEven(d, v, v);
}

template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xA0> /*idx_3210_tag*/, V v) {
  return DupEven(v);
}

template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xB1> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return Reverse2(d, v);
}

template <class V, HWY_IF_LANES_D(DFromV<V>, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return ConcatOdd(d, v, v);
}

template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xE4> /*idx_3210_tag*/, V v) {
  return v;
}

template <class V,
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) |
                                        (HWY_HAVE_INTEGER64 ? (1 << 4) : 0))>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  const auto vw = Per4LaneBlockShufCastToWide(
      hwy::IsFloatTag<TFromV<V>>(), hwy::SizeTag<sizeof(TFromV<V>)>(), v);
  return BitCast(d, DupOdd(vw));
}

template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xF5> /*idx_3210_tag*/, V v) {
  return DupOdd(v);
}

template <class V, HWY_IF_T_SIZE_V(V, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xFA> /*idx_3210_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return InterleaveUpper(d, v, v);
}

template <size_t kIdx3210, class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210> idx_3210_tag, V v) {
  const DFromV<decltype(v)> d;
  return Per4LaneBlockShuffle(idx_3210_tag, hwy::SizeTag<sizeof(TFromV<V>)>(),
                              hwy::SizeTag<d.MaxBytes()>(), v);
}

}  // namespace detail
#endif  // HWY_TARGET != HWY_SCALAR

template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
          HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_API V Per4LaneBlockShuffle(V v) {
  static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
  static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
  static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
  static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");
  return v;
}

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
          HWY_IF_LANES_D(DFromV<V>, 2)>
HWY_API V Per4LaneBlockShuffle(V v) {
  static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
  static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
  static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
  static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");

  constexpr bool isReverse2 = (kIdx0 == 1 || kIdx1 == 0) && (kIdx0 != kIdx1);
  constexpr size_t kPer2BlkIdx0 = (kIdx0 <= 1) ? kIdx0 : (isReverse2 ? 1 : 0);
  constexpr size_t kPer2BlkIdx1 = (kIdx1 <= 1) ? kIdx1 : (isReverse2 ? 0 : 1);

  constexpr size_t kIdx10 = (kPer2BlkIdx1 << 1) | kPer2BlkIdx0;
  static_assert(kIdx10 <= 3, "kIdx10 <= 3 must be true");
  return detail::Per2LaneBlockShuffle(hwy::SizeTag<kIdx10>(), v);
}

template <size_t kIdx3, size_t kIdx2, size_t kIdx1, size_t kIdx0, class V,
          HWY_IF_LANES_GT_D(DFromV<V>, 2)>
HWY_API V Per4LaneBlockShuffle(V v) {
  static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true");
  static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true");
  static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true");
  static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true");

  constexpr size_t kIdx3210 =
      (kIdx3 << 6) | (kIdx2 << 4) | (kIdx1 << 2) | kIdx0;
  return detail::Per4LaneBlockShuffle(hwy::SizeTag<kIdx3210>(), v);
}
#endif
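
// Usage sketch (illustrative, assuming a target with at least four u32 lanes):
//   const ScalableTag<uint32_t> d;
//   const auto v = Iota(d, 0);  // 0, 1, 2, 3, 4, 5, ...
//   // Reverses each block of four lanes: 3, 2, 1, 0, 7, 6, 5, 4, ...
//   const auto r = Per4LaneBlockShuffle<0, 1, 2, 3>(v);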

// ------------------------------ Blocks

template <class D>
HWY_API size_t Blocks(D d) {
  return (d.MaxBytes() <= 16) ? 1 : ((Lanes(d) * sizeof(TFromD<D>) + 15) / 16);
}
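
// Illustrative example (not from the original source): with 32-byte vectors,
// Blocks(ScalableTag<float>()) returns (8 * 4 + 15) / 16 == 2; for any vector
// of at most 16 bytes it returns 1.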

// ------------------------------ Block insert/extract/broadcast ops
#if (defined(HWY_NATIVE_BLK_INSERT_EXTRACT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BLK_INSERT_EXTRACT
#undef HWY_NATIVE_BLK_INSERT_EXTRACT
#else
#define HWY_NATIVE_BLK_INSERT_EXTRACT
#endif

template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V InsertBlock(V /*v*/, V blk_to_insert) {
  static_assert(kBlockIdx == 0, "Invalid block index");
  return blk_to_insert;
}

template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V ExtractBlock(V v) {
  static_assert(kBlockIdx == 0, "Invalid block index");
  return v;
}

template <int kBlockIdx, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V BroadcastBlock(V v) {
  static_assert(kBlockIdx == 0, "Invalid block index");
  return v;
}

#endif  // HWY_NATIVE_BLK_INSERT_EXTRACT

// ------------------------------ BroadcastLane
#if (defined(HWY_NATIVE_BROADCASTLANE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BROADCASTLANE
#undef HWY_NATIVE_BROADCASTLANE
#else
#define HWY_NATIVE_BROADCASTLANE
#endif

template <int kLane, class V, HWY_IF_V_SIZE_LE_V(V, 16)>
HWY_API V BroadcastLane(V v) {
  return Broadcast<kLane>(v);
}

#endif  // HWY_NATIVE_BROADCASTLANE

// ------------------------------ Slide1Up and Slide1Down
#if (defined(HWY_NATIVE_SLIDE1_UP_DOWN) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SLIDE1_UP_DOWN
#undef HWY_NATIVE_SLIDE1_UP_DOWN
#else
#define HWY_NATIVE_SLIDE1_UP_DOWN
#endif

template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> /*v*/) {
  return Zero(d);
}
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> Slide1Down(D d, VFromD<D> /*v*/) {
  return Zero(d);
}

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Slide1Up(D d, VFromD<D> v) {
  return ShiftLeftLanes<1>(d, v);
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> Slide1Down(D d, VFromD<D> v) {
  return ShiftRightLanes<1>(d, v);
}
#endif  // HWY_TARGET != HWY_SCALAR

#endif  // HWY_NATIVE_SLIDE1_UP_DOWN
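
// Usage sketch (illustrative):
//   const ScalableTag<int32_t> d;
//   const auto v = Iota(d, 1);           // 1, 2, 3, 4, ...
//   const auto up = Slide1Up(d, v);      // 0, 1, 2, 3, ... (zero in lane 0)
//   const auto down = Slide1Down(d, v);  // 2, 3, 4, ..., 0 (zero in last lane)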

// ------------------------------ SlideUpBlocks

template <int kBlocks, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> SlideUpBlocks(D /*d*/, VFromD<D> v) {
  static_assert(kBlocks == 0, "kBlocks == 0 must be true");
  return v;
}

#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
template <int kBlocks, class D, HWY_IF_V_SIZE_GT_D(D, 16)>
HWY_API VFromD<D> SlideUpBlocks(D d, VFromD<D> v) {
  static_assert(0 <= kBlocks && static_cast<size_t>(kBlocks) < d.MaxBlocks(),
                "kBlocks must be between 0 and d.MaxBlocks() - 1");
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
  return SlideUpLanes(d, v, static_cast<size_t>(kBlocks) * kLanesPerBlock);
}
#endif

// ------------------------------ SlideDownBlocks

template <int kBlocks, class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> SlideDownBlocks(D /*d*/, VFromD<D> v) {
  static_assert(kBlocks == 0, "kBlocks == 0 must be true");
  return v;
}

#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256
template <int kBlocks, class D, HWY_IF_V_SIZE_GT_D(D, 16)>
HWY_API VFromD<D> SlideDownBlocks(D d, VFromD<D> v) {
  static_assert(0 <= kBlocks && static_cast<size_t>(kBlocks) < d.MaxBlocks(),
                "kBlocks must be between 0 and d.MaxBlocks() - 1");
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD<D>);
  return SlideDownLanes(d, v, static_cast<size_t>(kBlocks) * kLanesPerBlock);
}
#endif
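
// Usage sketch (illustrative, assuming a target with 32-byte u32 vectors):
//   const ScalableTag<uint32_t> d;               // 8 lanes of u32
//   const auto v = Iota(d, 0);                   // 0..7
//   const auto up = SlideUpBlocks<1>(d, v);      // 0, 0, 0, 0, 0, 1, 2, 3
//   const auto down = SlideDownBlocks<1>(d, v);  // 4, 5, 6, 7, 0, 0, 0, 0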

// ------------------------------ Slide mask up/down
#if (defined(HWY_NATIVE_SLIDE_MASK) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SLIDE_MASK
#undef HWY_NATIVE_SLIDE_MASK
#else
#define HWY_NATIVE_SLIDE_MASK
#endif

template <class D>
HWY_API Mask<D> SlideMask1Up(D d, Mask<D> m) {
  return MaskFromVec(Slide1Up(d, VecFromMask(d, m)));
}

template <class D>
HWY_API Mask<D> SlideMask1Down(D d, Mask<D> m) {
  return MaskFromVec(Slide1Down(d, VecFromMask(d, m)));
}

template <class D>
HWY_API Mask<D> SlideMaskUpLanes(D d, Mask<D> m, size_t amt) {
  return MaskFromVec(SlideUpLanes(d, VecFromMask(d, m), amt));
}

template <class D>
HWY_API Mask<D> SlideMaskDownLanes(D d, Mask<D> m, size_t amt) {
  return MaskFromVec(SlideDownLanes(d, VecFromMask(d, m), amt));
}

#endif  // HWY_NATIVE_SLIDE_MASK

// ------------------------------ SumsOfAdjQuadAbsDiff

#if (defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#endif

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <int kAOffset, int kBOffset, class V8, HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
  static_assert(0 <= kAOffset && kAOffset <= 1,
                "kAOffset must be between 0 and 1");
  static_assert(0 <= kBOffset && kBOffset <= 3,
                "kBOffset must be between 0 and 3");
  using D8 = DFromV<V8>;
  const D8 d8;
  const RebindToUnsigned<decltype(d8)> du8;
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(du8)> du16;

  // Ensure that a is resized to a vector that has at least
  // HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the interleave and
  // CombineShiftRightBytes operations below.
#if HWY_TARGET == HWY_RVV
  // On RVV targets, need to ensure that d8_interleave.Pow2() >= 0 is true
  // to ensure that Lanes(d8_interleave) >= 16 is true.
  // Lanes(d8_interleave) >= Lanes(d8) is guaranteed to be true on RVV
  // targets as d8_interleave.Pow2() >= d8.Pow2() is true.
  constexpr int kInterleavePow2 = HWY_MAX(d8.Pow2(), 0);
  const ScalableTag<TFromD<D8>, kInterleavePow2> d8_interleave;
#elif HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
  // On SVE targets, Lanes(d8_interleave) >= 16 and
  // Lanes(d8_interleave) >= Lanes(d8) are both already true as d8 is a SIMD
  // tag for a full u8/i8 vector on SVE.
  const D8 d8_interleave;
#else
  // On targets that use non-scalable vector types, Lanes(d8_interleave) is
  // equal to HWY_MAX(Lanes(d8), size_t{8} << kAOffset).
  constexpr size_t kInterleaveLanes =
      HWY_MAX(HWY_MAX_LANES_D(D8), size_t{8} << kAOffset);
  const FixedTag<TFromD<D8>, kInterleaveLanes> d8_interleave;
#endif

  // The ResizeBitCast operation below will resize a to a vector that has
  // at least HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the
  // InterleaveLower, InterleaveUpper, and CombineShiftRightBytes operations
  // below.
  const auto a_to_interleave = ResizeBitCast(d8_interleave, a);

  const auto a_interleaved_lo =
      InterleaveLower(d8_interleave, a_to_interleave, a_to_interleave);
  const auto a_interleaved_hi =
      InterleaveUpper(d8_interleave, a_to_interleave, a_to_interleave);

  /* a01: { a[kAOffset*4+0], a[kAOffset*4+1], a[kAOffset*4+1], a[kAOffset*4+2],
            a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
            a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
            a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8] }
   */
  /* a23: { a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
            a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
            a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8],
            a[kAOffset*4+8], a[kAOffset*4+9], a[kAOffset*4+9], a[kAOffset*4+10]
   } */

  // a01 and a23 are resized back to V8 as only the first Lanes(d8) lanes of
  // the CombineShiftRightBytes are needed for the subsequent AbsDiff
  // operations and as a01 and a23 need to be the same vector type as b01 and
  // b23 for the AbsDiff operations below.
  const V8 a01 =
      ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 1>(
                            d8_interleave, a_interleaved_hi, a_interleaved_lo));
  const V8 a23 =
      ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 5>(
                            d8_interleave, a_interleaved_hi, a_interleaved_lo));

  /* b01: { b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1] }
   */
  /* b23: { b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3] }
   */
  const V8 b01 = BitCast(d8, Broadcast<kBOffset * 2>(BitCast(d16, b)));
  const V8 b23 = BitCast(d8, Broadcast<kBOffset * 2 + 1>(BitCast(d16, b)));

  const VFromD<decltype(du16)> absdiff_sum_01 =
      SumsOf2(BitCast(du8, AbsDiff(a01, b01)));
  const VFromD<decltype(du16)> absdiff_sum_23 =
      SumsOf2(BitCast(du8, AbsDiff(a23, b23)));
  return BitCast(d16, Add(absdiff_sum_01, absdiff_sum_23));
}
#endif  // HWY_TARGET != HWY_SCALAR

#endif  // HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
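
// Illustrative summary (not from the original source): within each 16-byte
// block, u16 result lane i is
//   |a[kAOffset*4 + i + 0] - b[kBOffset*4 + 0]| + ... +
//   |a[kAOffset*4 + i + 3] - b[kBOffset*4 + 3]|,
// i.e. a sliding 4-byte sum of absolute differences (cf. x86 MPSADBW).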

// ------------------------------ SumsOfShuffledQuadAbsDiff

#if (defined(HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#endif

#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V8,
          HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
                                                                     V8 b) {
  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");

#if HWY_TARGET == HWY_RVV
  // On RVV, ensure that both vA and vB have a LMUL of at least 1/2 so that
  // both vA and vB can be bitcasted to a u32 vector.
  const detail::AdjustSimdTagToMinVecPow2<
      RepartitionToWideX2<DFromV<decltype(a)>>>
      d32;
  const RepartitionToNarrow<decltype(d32)> d16;
  const RepartitionToNarrow<decltype(d16)> d8;
  const auto vA = ResizeBitCast(d8, a);
  const auto vB = ResizeBitCast(d8, b);
#else
  const DFromV<decltype(a)> d8;
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  const auto vA = a;
  const auto vB = b;
#endif
  const RebindToUnsigned<decltype(d8)> du8;

  const auto a_shuf =
      Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0>(BitCast(d32, vA));

  /* a0123_2345: { a_shuf[0], a_shuf[1], a_shuf[2], a_shuf[3],
                   a_shuf[2], a_shuf[3], a_shuf[4], a_shuf[5],
                   a_shuf[8], a_shuf[9], a_shuf[10], a_shuf[11],
                   a_shuf[10], a_shuf[11], a_shuf[12], a_shuf[13] } */
  /* a1234_3456: { a_shuf[1], a_shuf[2], a_shuf[3], a_shuf[4],
                   a_shuf[3], a_shuf[4], a_shuf[5], a_shuf[6],
                   a_shuf[9], a_shuf[10], a_shuf[11], a_shuf[12],
                   a_shuf[11], a_shuf[12], a_shuf[13], a_shuf[14] } */
#if HWY_HAVE_SCALABLE || HWY_TARGET_IS_SVE
  // On RVV/SVE targets, use Slide1Up/Slide1Down instead of
  // ShiftLeftBytes/ShiftRightBytes to avoid unnecessary zeroing out of any
  // lanes that are shifted into an adjacent 16-byte block as any lanes that
  // are shifted into an adjacent 16-byte block by Slide1Up/Slide1Down will be
  // replaced by the OddEven operation.
  const auto a_0123_2345 = BitCast(
      d8, OddEven(BitCast(d32, Slide1Up(d16, BitCast(d16, a_shuf))), a_shuf));
  const auto a_1234_3456 =
      BitCast(d8, OddEven(BitCast(d32, Slide1Up(d8, BitCast(d8, a_shuf))),
                          BitCast(d32, Slide1Down(d8, BitCast(d8, a_shuf)))));
#else
  const auto a_0123_2345 =
      BitCast(d8, OddEven(ShiftLeftBytes<2>(d32, a_shuf), a_shuf));
  const auto a_1234_3456 = BitCast(
      d8,
      OddEven(ShiftLeftBytes<1>(d32, a_shuf), ShiftRightBytes<1>(d32, a_shuf)));
#endif

  auto even_sums = SumsOf4(BitCast(du8, AbsDiff(a_0123_2345, vB)));
  auto odd_sums = SumsOf4(BitCast(du8, AbsDiff(a_1234_3456, vB)));

#if HWY_IS_LITTLE_ENDIAN
  odd_sums = ShiftLeft<16>(odd_sums);
#else
  even_sums = ShiftLeft<16>(even_sums);
#endif

  const auto sums = OddEven(BitCast(d16, odd_sums), BitCast(d16, even_sums));

#if HWY_TARGET == HWY_RVV
  return ResizeBitCast(RepartitionToWide<DFromV<V8>>(), sums);
#else
  return sums;
#endif
}
#endif  // HWY_TARGET != HWY_SCALAR

#endif  // HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
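
// Illustrative summary (not from the original source): a is first shuffled as
// u32 lanes via Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0>; then, within
// each 16-byte block, u16 result lanes 2j and 2j+1 are the 4-byte sums of
// absolute differences between the shuffled a starting at bytes 2j and 2j+1
// and bytes 4j..4j+3 of b (cf. the AVX-512 DBPSADBW operation).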

// ------------------------------ BitShuffle (Rol)

#if (defined(HWY_NATIVE_BITSHUFFLE) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_BITSHUFFLE
#undef HWY_NATIVE_BITSHUFFLE
#else
#define HWY_NATIVE_BITSHUFFLE
#endif

#if HWY_HAVE_INTEGER64 && HWY_TARGET != HWY_SCALAR
template <class V, class VI, HWY_IF_UI64(TFromV<V>), HWY_IF_UI8(TFromV<VI>)>
HWY_API V BitShuffle(V v, VI idx) {
  const DFromV<decltype(v)> d64;
  const RebindToUnsigned<decltype(d64)> du64;
  const Repartition<uint8_t, decltype(d64)> du8;

#if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \
    HWY_TARGET == HWY_WASM_EMU256
  const Repartition<uint16_t, decltype(d64)> d_idx_shr;
#else
  const Repartition<uint8_t, decltype(d64)> d_idx_shr;
#endif

#if HWY_IS_LITTLE_ENDIAN
  constexpr uint64_t kExtractedBitsMask =
      static_cast<uint64_t>(0x8040201008040201u);
#else
  constexpr uint64_t kExtractedBitsMask =
      static_cast<uint64_t>(0x0102040810204080u);
#endif

  const auto byte_idx = BitwiseIfThenElse(
      Set(du8, uint8_t{0x07}),
      BitCast(du8, ShiftRight<3>(BitCast(d_idx_shr, idx))),
      BitCast(du8, Dup128VecFromValues(du64, uint64_t{0},
                                       uint64_t{0x0808080808080808u})));
  // We want to shift right by idx & 7 to extract the desired bit in `bytes`,
  // and left by iota & 7 to put it in the correct output bit. To correctly
  // handle shift counts from -7 to 7, we rotate.
  const auto rotate_left_bits = Sub(Iota(du8, uint8_t{0}), BitCast(du8, idx));
  const auto extracted_bits =
      And(Rol(TableLookupBytes(v, byte_idx), rotate_left_bits),
          BitCast(du8, Set(du64, kExtractedBitsMask)));

  // Combine bit-sliced (one bit per byte) into one 64-bit sum.
  return BitCast(d64, SumsOf8(extracted_bits));
}
#endif  // HWY_HAVE_INTEGER64 && HWY_TARGET != HWY_SCALAR

#endif  // HWY_NATIVE_BITSHUFFLE
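
// Usage sketch (illustrative): bit i (i = 0..7) of each u64 output lane is bit
// idx[i] (0..63) of the corresponding input lane; the upper 56 bits are zero.
// Given a u64 vector v:
//   const ScalableTag<uint64_t> d64;
//   const Repartition<uint8_t, decltype(d64)> du8;
//   // Gathers the lowest bit of each byte of every u64 lane into bits 0..7.
//   const auto gathered =
//       BitShuffle(v, Dup128VecFromValues(du8, 0, 8, 16, 24, 32, 40, 48, 56,
//                                         0, 8, 16, 24, 32, 40, 48, 56));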

// ================================================== Operator wrapper

// SVE* and RVV currently cannot define operators and have already defined
// (only) the corresponding functions such as Add.
#if (defined(HWY_NATIVE_OPERATOR_REPLACEMENTS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
#else
#define HWY_NATIVE_OPERATOR_REPLACEMENTS
#endif

template <class V>
HWY_API V Add(V a, V b) {
  return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
  return a - b;
}

template <class V>
HWY_API V Mul(V a, V b) {
  return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
  return a / b;
}
template <class V>
HWY_API V Mod(V a, V b) {
  return a % b;
}

template <class V>
V Shl(V a, V b) {
  return a << b;
}
template <class V>
V Shr(V a, V b) {
  return a >> b;
}

template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
  return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
  return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
  return a < b;
}

template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
  return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
  return a >= b;
}

template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
  return a <= b;
}

#endif  // HWY_NATIVE_OPERATOR_REPLACEMENTS
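
// Usage sketch (illustrative): portable code can prefer the named functions,
//   const auto sum = Add(a, b);   // instead of a + b
//   const auto less = Lt(a, b);   // instead of a < b
// so the same source also builds on SVE/RVV targets, which define only the
// functions and not the operators.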

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();