row_win.cc

/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

// This module is for Visual C 32/64 bit and clangcl 32 bit
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))

#if defined(_M_X64)
#include <emmintrin.h>
#include <tmmintrin.h>  // For _mm_maddubs_epi16
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// 64 bit
#if defined(_M_X64)

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
  xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
  u_buf += 4; \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
  y_buf += 8;

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
  xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
  u_buf += 4; \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
  y_buf += 8; \
  xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
  a_buf += 8;

// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(yuvconstants) \
  xmm1 = _mm_loadu_si128(&xmm0); \
  xmm2 = _mm_loadu_si128(&xmm0); \
  xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
  xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
  xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
  xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
  xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
  xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
  xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
  xmm0 = _mm_adds_epi16(xmm0, xmm4); \
  xmm1 = _mm_adds_epi16(xmm1, xmm4); \
  xmm2 = _mm_adds_epi16(xmm2, xmm4); \
  xmm0 = _mm_srai_epi16(xmm0, 6); \
  xmm1 = _mm_srai_epi16(xmm1, 6); \
  xmm2 = _mm_srai_epi16(xmm2, 6); \
  xmm0 = _mm_packus_epi16(xmm0, xmm0); \
  xmm1 = _mm_packus_epi16(xmm1, xmm1); \
  xmm2 = _mm_packus_epi16(xmm2, xmm2);

// Store 8 ARGB values.
#define STOREARGB \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
  xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
  xmm1 = _mm_loadu_si128(&xmm0); \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
  xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
  _mm_storeu_si128((__m128i*)dst_argb, xmm0); \
  _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
  dst_argb += 32;
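
// Illustrative scalar sketch (not part of LibYuv) of the fixed-point math the
// READYUV422 / YUVTORGB / STOREARGB macros above implement. The coefficient
// and bias parameters stand in for the YuvConstants fields (kUVToB, kUVBiasB,
// kYToRgb, ...); the function names are made up for this sketch. The SIMD
// adds are saturating (_mm_adds_epi16), which this scalar version ignores.
static int ScalarClamp255(int v) {
  return v < 0 ? 0 : (v > 255 ? 255 : v);
}
static void ScalarYuvToRgbPixel(uint8_t y, uint8_t u, uint8_t v,
                                int ub, int vb, int ug, int vg, int ur, int vr,
                                int bias_b, int bias_g, int bias_r, int yg,
                                uint8_t* b, uint8_t* g, uint8_t* r) {
  // READYUV422 widens Y to 16 bits as y * 0x0101; _mm_mulhi_epu16 then keeps
  // the high 16 bits of the product with kYToRgb.
  int y1 = ((y * 0x0101) * yg) >> 16;
  // Per channel: (bias - (u, v) dot product) + scaled Y, then >> 6 and clamp,
  // mirroring the sub / adds / srai / packus sequence in YUVTORGB.
  *b = (uint8_t)ScalarClamp255((bias_b - (u * ub + v * vb) + y1) >> 6);
  *g = (uint8_t)ScalarClamp255((bias_g - (u * ug + v * vg) + y1) >> 6);
  *r = (uint8_t)ScalarClamp255((bias_r - (u * ur + v * vr) + y1) >> 6);
  // STOREARGB then interleaves B, G, R with the alpha register (0xff in
  // I422ToARGBRow, the a_buf bytes in the alpha variant).
}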
#if defined(HAS_I422TOARGBROW_SSSE3)
void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
                         const uint8_t* u_buf,
                         const uint8_t* v_buf,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __m128i xmm0, xmm1, xmm2, xmm4;
  const __m128i xmm5 = _mm_set1_epi8(-1);
  const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
  while (width > 0) {
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                              const uint8_t* u_buf,
                              const uint8_t* v_buf,
                              const uint8_t* a_buf,
                              uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
  __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
  const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
  while (width > 0) {
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

// 32 bit
#else  // defined(_M_X64)

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                              13, 65, 33, 0, 13, 65, 33, 0};

// JPeg full range.
static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                               15, 75, 38, 0, 15, 75, 38, 0};

static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {
    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};

// Constants for BGRA.
static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                              0, 33, 65, 13, 0, 33, 65, 13};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR.
static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                              33, 65, 13, 0, 33, 65, 13, 0};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                              0, 13, 65, 33, 0, 13, 65, 33};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
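
// Illustrative scalar equivalent (not part of LibYuv) of the luma path these
// constants drive. Assuming the row functions below combine kARGBToY with
// pmaddubsw, a >> 7, and the kAddY16 bias (ARGB is stored B, G, R, A in
// memory), a per-pixel version would be roughly:
static uint8_t ScalarARGBToY(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}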

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
                                    10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
                                    6, 6, 8, 8, 10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
                                     11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
                                     5, 7, 9, 11, 9, 11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
                                    11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
                                    7, 7, 9, 9, 11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
                                     10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
                                     4, 6, 8, 10, 8, 10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};

// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    mov eax, [esp + 4]  // src_y
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0xff000000
    pslld xmm5, 24
  convertloop:
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0
    punpckhwd xmm1, xmm1
    por xmm0, xmm5
    por xmm1, xmm5
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}

#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    mov eax, [esp + 4]  // src_y
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0xff000000
    vpslld ymm5, ymm5, 24
  convertloop:
    vmovdqu xmm0, [eax]
    lea eax, [eax + 16]
    vpermq ymm0, ymm0, 0xd8
    vpunpcklbw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    vpunpckhwd ymm1, ymm0, ymm0
    vpunpcklwd ymm0, ymm0, ymm0
    vpor ymm0, ymm0, ymm5
    vpor ymm1, ymm1, ymm5
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_J400TOARGBROW_AVX2

__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    mov eax, [esp + 4]  // src_rgb24
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqu [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqu [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqu [edx + 16], xmm1
    por xmm3, xmm5
    movdqu [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    ret
  }
}
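
// Scalar view (not part of LibYuv) of what the shuffle and mask above achieve
// for each pixel: copy B, G, R and append an opaque alpha byte (the
// 0xff000000 mask held in xmm5). Function name is illustrative only.
static void ScalarRGB24ToARGB(const uint8_t* src_rgb24, uint8_t* dst_argb,
                              int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = src_rgb24[0];  // B
    dst_argb[1] = src_rgb24[1];  // G
    dst_argb[2] = src_rgb24[2];  // R
    dst_argb[3] = 255u;          // A
    src_rgb24 += 3;
    dst_argb += 4;
  }
}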

__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    mov eax, [esp + 4]  // src_raw
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqu [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqu [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqu [edx + 16], xmm1
    por xmm3, xmm5
    movdqu [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                                           uint8_t* dst_rgb24,
                                           int width) {
  __asm {
    mov eax, [esp + 4]  // src_raw
    mov edx, [esp + 8]  // dst_rgb24
    mov ecx, [esp + 12]  // width
    movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
    movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
    movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 4]
    movdqu xmm2, [eax + 8]
    lea eax, [eax + 24]
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 8
    jg convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    pcmpeqb xmm4, xmm4  // generate mask 0x07e007e0 for Green
    psllw xmm4, 10
    psrlw xmm4, 5
    pcmpeqb xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8
    mov eax, [esp + 4]  // src_rgb565
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax
    sub edx, eax
  convertloop:
    movdqu xmm0, [eax]  // fetch 8 pixels of bgr565
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    pand xmm1, xmm3  // R in upper 5 bits
    psllw xmm2, 11  // B in upper 5 bits
    pmulhuw xmm1, xmm5  // * (256 + 8)
    pmulhuw xmm2, xmm5  // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2  // RB
    pand xmm0, xmm4  // G in middle 6 bits
    pmulhuw xmm0, xmm6  // << 5 * (256 + 4)
    por xmm0, xmm7  // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
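
// Scalar sketch (not part of LibYuv) of the pmul bit replication described
// above. Expanding a 5-bit channel to 8 bits is (v << 3) | (v >> 2); with the
// value already positioned in the top 5 bits of a 16-bit lane, multiplying by
// 0x0108 = 256 + 8 and keeping the high 16 bits (pmulhuw) gives the same
// result. The 6-bit green path uses 0x2080 the same way. Names are
// illustrative only.
static uint8_t ScalarExpand5(uint16_t v5) {  // v5 in [0, 31]
  return (uint8_t)((v5 << 3) | (v5 >> 2));
}
static uint8_t ScalarExpand5Pmul(uint16_t v5) {  // same result, pmulhuw style
  uint16_t lane = (uint16_t)(v5 << 11);  // value in the top 5 bits of the lane
  return (uint8_t)(((uint32_t)lane * 0x0108u) >> 16);
}
static uint8_t ScalarExpand6(uint16_t v6) {  // v6 in [0, 63]
  return (uint8_t)((v6 << 2) | (v6 >> 4));
}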

#ifdef HAS_RGB565TOARGBROW_AVX2
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd xmm5, eax
    vbroadcastss ymm5, xmm5
    mov eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    vmovd xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw ymm3, ymm3, 11
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
    vpsllw ymm4, ymm4, 10
    vpsrlw ymm4, ymm4, 5
    vpcmpeqb ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw ymm7, ymm7, 8
    mov eax, [esp + 4]  // src_rgb565
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax
    sub edx, eax
  convertloop:
    vmovdqu ymm0, [eax]  // fetch 16 pixels of bgr565
    vpand ymm1, ymm0, ymm3  // R in upper 5 bits
    vpsllw ymm2, ymm0, 11  // B in upper 5 bits
    vpmulhuw ymm1, ymm1, ymm5  // * (256 + 8)
    vpmulhuw ymm2, ymm2, ymm5  // * (256 + 8)
    vpsllw ymm1, ymm1, 8
    vpor ymm1, ymm1, ymm2  // RB
    vpand ymm0, ymm0, ymm4  // G in middle 6 bits
    vpmulhuw ymm0, ymm0, ymm6  // << 5 * (256 + 4)
    vpor ymm0, ymm0, ymm7  // AG
    vpermq ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_RGB565TOARGBROW_AVX2

#ifdef HAS_ARGB1555TOARGBROW_AVX2
__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd xmm5, eax
    vbroadcastss ymm5, xmm5
    mov eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    vmovd xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw ymm3, ymm3, 11
    vpsrlw ymm4, ymm3, 6  // generate mask 0x03e003e0 for Green
    vpcmpeqb ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw ymm7, ymm7, 8
    mov eax, [esp + 4]  // src_argb1555
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax
    sub edx, eax
  convertloop:
    vmovdqu ymm0, [eax]  // fetch 16 pixels of 1555
    vpsllw ymm1, ymm0, 1  // R in upper 5 bits
    vpsllw ymm2, ymm0, 11  // B in upper 5 bits
    vpand ymm1, ymm1, ymm3
    vpmulhuw ymm2, ymm2, ymm5  // * (256 + 8)
    vpmulhuw ymm1, ymm1, ymm5  // * (256 + 8)
    vpsllw ymm1, ymm1, 8
    vpor ymm1, ymm1, ymm2  // RB
    vpsraw ymm2, ymm0, 8  // A
    vpand ymm0, ymm0, ymm4  // G in middle 5 bits
    vpmulhuw ymm0, ymm0, ymm6  // << 6 * (256 + 8)
    vpand ymm2, ymm2, ymm7
    vpor ymm0, ymm0, ymm2  // AG
    vpermq ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB1555TOARGBROW_AVX2

#ifdef HAS_ARGB4444TOARGBROW_AVX2
__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    vmovd xmm4, eax
    vbroadcastss ymm4, xmm4
    vpslld ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
    mov eax, [esp + 4]  // src_argb4444
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax
    sub edx, eax
  convertloop:
    vmovdqu ymm0, [eax]  // fetch 16 pixels of bgra4444
    vpand ymm2, ymm0, ymm5  // mask high nibbles
    vpand ymm0, ymm0, ymm4  // mask low nibbles
    vpsrlw ymm3, ymm2, 4
    vpsllw ymm1, ymm0, 4
    vpor ymm2, ymm2, ymm3
    vpor ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq ymm2, ymm2, 0xd8
    vpunpckhbw ymm1, ymm0, ymm2
    vpunpcklbw ymm0, ymm0, ymm2
    vmovdqu [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB4444TOARGBROW_AVX2

// 24 instructions
__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    movdqa xmm4, xmm3  // generate mask 0x03e003e0 for Green
    psrlw xmm4, 6
    pcmpeqb xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8
    mov eax, [esp + 4]  // src_argb1555
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax
    sub edx, eax
  convertloop:
    movdqu xmm0, [eax]  // fetch 8 pixels of 1555
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    psllw xmm1, 1  // R in upper 5 bits
    psllw xmm2, 11  // B in upper 5 bits
    pand xmm1, xmm3
    pmulhuw xmm2, xmm5  // * (256 + 8)
    pmulhuw xmm1, xmm5  // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2  // RB
    movdqa xmm2, xmm0
    pand xmm0, xmm4  // G in middle 5 bits
    psraw xmm2, 8  // A
    pmulhuw xmm0, xmm6  // << 6 * (256 + 8)
    pand xmm2, xmm7
    por xmm0, xmm2  // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd xmm4, eax
    pshufd xmm4, xmm4, 0
    movdqa xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
    pslld xmm5, 4
    mov eax, [esp + 4]  // src_argb4444
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax
    sub edx, eax
  convertloop:
    movdqu xmm0, [eax]  // fetch 8 pixels of bgra4444
    movdqa xmm2, xmm0
    pand xmm0, xmm4  // mask low nibbles
    pand xmm2, xmm5  // mask high nibbles
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    psllw xmm1, 4
    psrlw xmm3, 4
    por xmm0, xmm1
    por xmm2, xmm3
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
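
// Scalar sketch (not part of LibYuv) of the nibble expansion above: a 4-bit
// channel becomes 8 bits by duplicating the nibble, i.e. v * 0x11. The low-
// and high-nibble paths in the code (v | v << 4 and v | v >> 4) apply the
// same identity in place. Function name is illustrative only.
static uint8_t ScalarExpand4(uint8_t v4) {  // v4 in [0, 15]
  return (uint8_t)((v4 << 4) | v4);         // == v4 * 17
}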

__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
  convertloop:
    movdqu xmm0, [eax]  // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq xmm1, 4  // 8 bytes from 1
    pslldq xmm4, 12  // 4 bytes from 1 for 0
    movdqa xmm5, xmm2  // 8 bytes from 2 for 1
    por xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq xmm5, 8  // 8 bytes from 2 for 1
    movdqu [edx], xmm0  // store 0
    por xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq xmm2, 8  // 4 bytes from 2
    pslldq xmm3, 4  // 12 bytes from 3 for 2
    por xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1  // store 1
    movdqu [edx + 32], xmm2  // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
                                          uint8_t* dst_rgb,
                                          int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
  convertloop:
    movdqu xmm0, [eax]  // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq xmm1, 4  // 8 bytes from 1
    pslldq xmm4, 12  // 4 bytes from 1 for 0
    movdqa xmm5, xmm2  // 8 bytes from 2 for 1
    por xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq xmm5, 8  // 8 bytes from 2 for 1
    movdqu [edx], xmm0  // store 0
    por xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq xmm2, 8  // 4 bytes from 2
    pslldq xmm3, 4  // 12 bytes from 3 for 2
    por xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1  // store 1
    movdqu [edx + 32], xmm2  // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm3, xmm3  // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4  // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5  // generate mask 0xfffff800
    pslld xmm5, 11
  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    movdqa xmm1, xmm0  // B
    movdqa xmm2, xmm0  // G
    pslld xmm0, 8  // R
    psrld xmm1, 3  // B
    psrld xmm2, 5  // G
    psrad xmm0, 16  // R
    pand xmm1, xmm3  // B
    pand xmm2, xmm4  // G
    pand xmm0, xmm5  // R
    por xmm1, xmm2  // BG
    por xmm0, xmm1  // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
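
// Scalar sketch (not part of LibYuv) of the RGB565 packing above: keep the
// top 5/6/5 bits of B, G and R and pack them as B | G << 5 | R << 11, which
// is what the shift/mask/por sequence computes four pixels at a time.
// Function name is illustrative only.
static uint16_t ScalarARGBToRGB565(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}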

__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
                                                  uint8_t* dst_rgb,
                                                  const uint32_t dither4,
                                                  int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    movd xmm6, [esp + 12]  // dither4
    mov ecx, [esp + 16]  // width
    punpcklbw xmm6, xmm6  // make dither 16 bytes
    movdqa xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7
    pcmpeqb xmm3, xmm3  // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4  // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5  // generate mask 0xfffff800
    pslld xmm5, 11
  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    paddusb xmm0, xmm6  // add dither
    movdqa xmm1, xmm0  // B
    movdqa xmm2, xmm0  // G
    pslld xmm0, 8  // R
    psrld xmm1, 3  // B
    psrld xmm2, 5  // G
    psrad xmm0, 16  // R
    pand xmm1, xmm3  // B
    pand xmm2, xmm4  // G
    pand xmm0, xmm5  // R
    por xmm1, xmm2  // BG
    por xmm0, xmm1  // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
                                                  uint8_t* dst_rgb,
                                                  const uint32_t dither4,
                                                  int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    vbroadcastss xmm6, [esp + 12]  // dither4
    mov ecx, [esp + 16]  // width
    vpunpcklbw xmm6, xmm6, xmm6  // make dither 32 bytes
    vpermq ymm6, ymm6, 0xd8
    vpunpcklwd ymm6, ymm6, ymm6
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld ymm3, ymm3, 27
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld ymm4, ymm4, 26
    vpslld ymm4, ymm4, 5
    vpslld ymm5, ymm3, 11  // generate mask 0x0000f800
  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpaddusb ymm0, ymm0, ymm6  // add dither
    vpsrld ymm2, ymm0, 5  // G
    vpsrld ymm1, ymm0, 3  // B
    vpsrld ymm0, ymm0, 8  // R
    vpand ymm2, ymm2, ymm4  // G
    vpand ymm1, ymm1, ymm3  // B
    vpand ymm0, ymm0, ymm5  // R
    vpor ymm1, ymm1, ymm2  // BG
    vpor ymm0, ymm0, ymm1  // BGR
    vpackusdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of RGB565
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm4, xmm4  // generate mask 0x0000001f
    psrld xmm4, 27
    movdqa xmm5, xmm4  // generate mask 0x000003e0
    pslld xmm5, 5
    movdqa xmm6, xmm4  // generate mask 0x00007c00
    pslld xmm6, 10
    pcmpeqb xmm7, xmm7  // generate mask 0xffff8000
    pslld xmm7, 15
  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    movdqa xmm1, xmm0  // B
    movdqa xmm2, xmm0  // G
    movdqa xmm3, xmm0  // R
    psrad xmm0, 16  // A
    psrld xmm1, 3  // B
    psrld xmm2, 6  // G
    psrld xmm3, 9  // R
    pand xmm0, xmm7  // A
    pand xmm1, xmm4  // B
    pand xmm2, xmm5  // G
    pand xmm3, xmm6  // R
    por xmm0, xmm1  // BA
    por xmm2, xmm3  // GR
    por xmm0, xmm2  // BGRA
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm4, xmm4  // generate mask 0xf000f000
    psllw xmm4, 12
    movdqa xmm3, xmm4  // generate mask 0x00f000f0
    psrlw xmm3, 8
  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    movdqa xmm1, xmm0
    pand xmm0, xmm3  // low nibble
    pand xmm1, xmm4  // high nibble
    psrld xmm0, 4
    psrld xmm1, 8
    por xmm0, xmm1
    packuswb xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565ROW_AVX2
__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld ymm3, ymm3, 27
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld ymm4, ymm4, 26
    vpslld ymm4, ymm4, 5
    vpslld ymm5, ymm3, 11  // generate mask 0x0000f800
  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld ymm2, ymm0, 5  // G
    vpsrld ymm1, ymm0, 3  // B
    vpsrld ymm0, ymm0, 8  // R
    vpand ymm2, ymm2, ymm4  // G
    vpand ymm1, ymm1, ymm3  // B
    vpand ymm0, ymm0, ymm5  // R
    vpor ymm1, ymm1, ymm2  // BG
    vpor ymm0, ymm0, ymm1  // BGR
    vpackusdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of RGB565
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565ROW_AVX2

#ifdef HAS_ARGBTOARGB1555ROW_AVX2
__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm4, ymm4, ymm4
    vpsrld ymm4, ymm4, 27  // generate mask 0x0000001f
    vpslld ymm5, ymm4, 5  // generate mask 0x000003e0
    vpslld ymm6, ymm4, 10  // generate mask 0x00007c00
    vpcmpeqb ymm7, ymm7, ymm7  // generate mask 0xffff8000
    vpslld ymm7, ymm7, 15
  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld ymm3, ymm0, 9  // R
    vpsrld ymm2, ymm0, 6  // G
    vpsrld ymm1, ymm0, 3  // B
    vpsrad ymm0, ymm0, 16  // A
    vpand ymm3, ymm3, ymm6  // R
    vpand ymm2, ymm2, ymm5  // G
    vpand ymm1, ymm1, ymm4  // B
    vpand ymm0, ymm0, ymm7  // A
    vpor ymm0, ymm0, ymm1  // BA
    vpor ymm2, ymm2, ymm3  // GR
    vpor ymm0, ymm0, ymm2  // BGRA
    vpackssdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of ARGB1555
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB1555ROW_AVX2

#ifdef HAS_ARGBTOARGB4444ROW_AVX2
__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0xf000f000
    vpsllw ymm4, ymm4, 12
    vpsrlw ymm3, ymm4, 8  // generate mask 0x00f000f0
  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpand ymm1, ymm0, ymm4  // high nibble
    vpand ymm0, ymm0, ymm3  // low nibble
    vpsrld ymm1, ymm1, 8
    vpsrld ymm0, ymm0, 4
    vpor ymm0, ymm0, ymm1
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of ARGB4444
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB4444ROW_AVX2

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
  1020. uint8_t* dst_y,
  1021. int width) {
  1022. __asm {
  1023. mov eax, [esp + 4] /* src_argb */
  1024. mov edx, [esp + 8] /* dst_y */
  1025. mov ecx, [esp + 12] /* width */
  1026. movdqa xmm4, xmmword ptr kARGBToY
  1027. movdqa xmm5, xmmword ptr kAddY16
  1028. convertloop:
  1029. movdqu xmm0, [eax]
  1030. movdqu xmm1, [eax + 16]
  1031. movdqu xmm2, [eax + 32]
  1032. movdqu xmm3, [eax + 48]
  1033. pmaddubsw xmm0, xmm4
  1034. pmaddubsw xmm1, xmm4
  1035. pmaddubsw xmm2, xmm4
  1036. pmaddubsw xmm3, xmm4
  1037. lea eax, [eax + 64]
  1038. phaddw xmm0, xmm1
  1039. phaddw xmm2, xmm3
  1040. psrlw xmm0, 7
  1041. psrlw xmm2, 7
  1042. packuswb xmm0, xmm2
  1043. paddb xmm0, xmm5
  1044. movdqu [edx], xmm0
  1045. lea edx, [edx + 16]
  1046. sub ecx, 16
  1047. jg convertloop
  1048. ret
  1049. }
  1050. }
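// A scalar sketch of the row above. The per-channel weights really come from
// kARGBToY; the 13/65/33 values below are the usual BT.601 studio-range
// weights at the 7-bit scale implied by psrlw 7 and are shown only for
// illustration -- the table defined earlier in this file is authoritative.
static void ARGBToYRow_Sketch(const uint8_t* src_argb,
                              uint8_t* dst_y,
                              int width) {
  for (int x = 0; x < width; ++x) {
    int b = src_argb[0];
    int g = src_argb[1];
    int r = src_argb[2];
    // pmaddubsw + phaddw form the weighted sum, psrlw 7 truncates, and
    // paddb kAddY16 adds the +16 studio-range offset.
    dst_y[x] = (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
    src_argb += 4;
  }
}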
  1051. // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
1052. // Same as ARGBToYRow but with different coefficients, no +16 bias, and rounding applied.
  1053. __declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
  1054. uint8_t* dst_y,
  1055. int width) {
  1056. __asm {
  1057. mov eax, [esp + 4] /* src_argb */
  1058. mov edx, [esp + 8] /* dst_y */
  1059. mov ecx, [esp + 12] /* width */
  1060. movdqa xmm4, xmmword ptr kARGBToYJ
  1061. movdqa xmm5, xmmword ptr kAddYJ64
  1062. convertloop:
  1063. movdqu xmm0, [eax]
  1064. movdqu xmm1, [eax + 16]
  1065. movdqu xmm2, [eax + 32]
  1066. movdqu xmm3, [eax + 48]
  1067. pmaddubsw xmm0, xmm4
  1068. pmaddubsw xmm1, xmm4
  1069. pmaddubsw xmm2, xmm4
  1070. pmaddubsw xmm3, xmm4
  1071. lea eax, [eax + 64]
  1072. phaddw xmm0, xmm1
  1073. phaddw xmm2, xmm3
  1074. paddw xmm0, xmm5 // Add .5 for rounding.
  1075. paddw xmm2, xmm5
  1076. psrlw xmm0, 7
  1077. psrlw xmm2, 7
  1078. packuswb xmm0, xmm2
  1079. movdqu [edx], xmm0
  1080. lea edx, [edx + 16]
  1081. sub ecx, 16
  1082. jg convertloop
  1083. ret
  1084. }
  1085. }
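// The YJ variant in scalar form, for contrast with ARGBToYRow_Sketch above:
// full-range weights from kARGBToYJ, rounding via the +64 bias (kAddYJ64)
// applied before the shift, and no +16 offset afterwards. The 15/75/38
// weights are illustrative assumptions; kARGBToYJ is authoritative.
static void ARGBToYJRow_Sketch(const uint8_t* src_argb,
                               uint8_t* dst_y,
                               int width) {
  for (int x = 0; x < width; ++x) {
    int b = src_argb[0];
    int g = src_argb[1];
    int r = src_argb[2];
    dst_y[x] = (uint8_t)((15 * b + 75 * g + 38 * r + 64) >> 7);
    src_argb += 4;
  }
}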
  1086. #ifdef HAS_ARGBTOYROW_AVX2
1087. // vpermd reorders lanes to undo the vphaddw + vpackuswb mutation.
  1088. static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
  1089. // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
  1090. __declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
  1091. uint8_t* dst_y,
  1092. int width) {
  1093. __asm {
  1094. mov eax, [esp + 4] /* src_argb */
  1095. mov edx, [esp + 8] /* dst_y */
  1096. mov ecx, [esp + 12] /* width */
  1097. vbroadcastf128 ymm4, xmmword ptr kARGBToY
  1098. vbroadcastf128 ymm5, xmmword ptr kAddY16
  1099. vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
  1100. convertloop:
  1101. vmovdqu ymm0, [eax]
  1102. vmovdqu ymm1, [eax + 32]
  1103. vmovdqu ymm2, [eax + 64]
  1104. vmovdqu ymm3, [eax + 96]
  1105. vpmaddubsw ymm0, ymm0, ymm4
  1106. vpmaddubsw ymm1, ymm1, ymm4
  1107. vpmaddubsw ymm2, ymm2, ymm4
  1108. vpmaddubsw ymm3, ymm3, ymm4
  1109. lea eax, [eax + 128]
  1110. vphaddw ymm0, ymm0, ymm1 // mutates.
  1111. vphaddw ymm2, ymm2, ymm3
  1112. vpsrlw ymm0, ymm0, 7
  1113. vpsrlw ymm2, ymm2, 7
  1114. vpackuswb ymm0, ymm0, ymm2 // mutates.
  1115. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
  1116. vpaddb ymm0, ymm0, ymm5 // add 16 for Y
  1117. vmovdqu [edx], ymm0
  1118. lea edx, [edx + 32]
  1119. sub ecx, 32
  1120. jg convertloop
  1121. vzeroupper
  1122. ret
  1123. }
  1124. }
  1125. #endif // HAS_ARGBTOYROW_AVX2
  1126. #ifdef HAS_ARGBTOYJROW_AVX2
1127. // Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
  1128. __declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
  1129. uint8_t* dst_y,
  1130. int width) {
  1131. __asm {
  1132. mov eax, [esp + 4] /* src_argb */
  1133. mov edx, [esp + 8] /* dst_y */
  1134. mov ecx, [esp + 12] /* width */
  1135. vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
  1136. vbroadcastf128 ymm5, xmmword ptr kAddYJ64
  1137. vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
  1138. convertloop:
  1139. vmovdqu ymm0, [eax]
  1140. vmovdqu ymm1, [eax + 32]
  1141. vmovdqu ymm2, [eax + 64]
  1142. vmovdqu ymm3, [eax + 96]
  1143. vpmaddubsw ymm0, ymm0, ymm4
  1144. vpmaddubsw ymm1, ymm1, ymm4
  1145. vpmaddubsw ymm2, ymm2, ymm4
  1146. vpmaddubsw ymm3, ymm3, ymm4
  1147. lea eax, [eax + 128]
  1148. vphaddw ymm0, ymm0, ymm1 // mutates.
  1149. vphaddw ymm2, ymm2, ymm3
  1150. vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
  1151. vpaddw ymm2, ymm2, ymm5
  1152. vpsrlw ymm0, ymm0, 7
  1153. vpsrlw ymm2, ymm2, 7
  1154. vpackuswb ymm0, ymm0, ymm2 // mutates.
  1155. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
  1156. vmovdqu [edx], ymm0
  1157. lea edx, [edx + 32]
  1158. sub ecx, 32
  1159. jg convertloop
  1160. vzeroupper
  1161. ret
  1162. }
  1163. }
  1164. #endif // HAS_ARGBTOYJROW_AVX2
  1165. __declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
  1166. uint8_t* dst_y,
  1167. int width) {
  1168. __asm {
  1169. mov eax, [esp + 4] /* src_argb */
  1170. mov edx, [esp + 8] /* dst_y */
  1171. mov ecx, [esp + 12] /* width */
  1172. movdqa xmm4, xmmword ptr kBGRAToY
  1173. movdqa xmm5, xmmword ptr kAddY16
  1174. convertloop:
  1175. movdqu xmm0, [eax]
  1176. movdqu xmm1, [eax + 16]
  1177. movdqu xmm2, [eax + 32]
  1178. movdqu xmm3, [eax + 48]
  1179. pmaddubsw xmm0, xmm4
  1180. pmaddubsw xmm1, xmm4
  1181. pmaddubsw xmm2, xmm4
  1182. pmaddubsw xmm3, xmm4
  1183. lea eax, [eax + 64]
  1184. phaddw xmm0, xmm1
  1185. phaddw xmm2, xmm3
  1186. psrlw xmm0, 7
  1187. psrlw xmm2, 7
  1188. packuswb xmm0, xmm2
  1189. paddb xmm0, xmm5
  1190. movdqu [edx], xmm0
  1191. lea edx, [edx + 16]
  1192. sub ecx, 16
  1193. jg convertloop
  1194. ret
  1195. }
  1196. }
  1197. __declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
  1198. uint8_t* dst_y,
  1199. int width) {
  1200. __asm {
  1201. mov eax, [esp + 4] /* src_argb */
  1202. mov edx, [esp + 8] /* dst_y */
  1203. mov ecx, [esp + 12] /* width */
  1204. movdqa xmm4, xmmword ptr kABGRToY
  1205. movdqa xmm5, xmmword ptr kAddY16
  1206. convertloop:
  1207. movdqu xmm0, [eax]
  1208. movdqu xmm1, [eax + 16]
  1209. movdqu xmm2, [eax + 32]
  1210. movdqu xmm3, [eax + 48]
  1211. pmaddubsw xmm0, xmm4
  1212. pmaddubsw xmm1, xmm4
  1213. pmaddubsw xmm2, xmm4
  1214. pmaddubsw xmm3, xmm4
  1215. lea eax, [eax + 64]
  1216. phaddw xmm0, xmm1
  1217. phaddw xmm2, xmm3
  1218. psrlw xmm0, 7
  1219. psrlw xmm2, 7
  1220. packuswb xmm0, xmm2
  1221. paddb xmm0, xmm5
  1222. movdqu [edx], xmm0
  1223. lea edx, [edx + 16]
  1224. sub ecx, 16
  1225. jg convertloop
  1226. ret
  1227. }
  1228. }
  1229. __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
  1230. uint8_t* dst_y,
  1231. int width) {
  1232. __asm {
  1233. mov eax, [esp + 4] /* src_argb */
  1234. mov edx, [esp + 8] /* dst_y */
  1235. mov ecx, [esp + 12] /* width */
  1236. movdqa xmm4, xmmword ptr kRGBAToY
  1237. movdqa xmm5, xmmword ptr kAddY16
  1238. convertloop:
  1239. movdqu xmm0, [eax]
  1240. movdqu xmm1, [eax + 16]
  1241. movdqu xmm2, [eax + 32]
  1242. movdqu xmm3, [eax + 48]
  1243. pmaddubsw xmm0, xmm4
  1244. pmaddubsw xmm1, xmm4
  1245. pmaddubsw xmm2, xmm4
  1246. pmaddubsw xmm3, xmm4
  1247. lea eax, [eax + 64]
  1248. phaddw xmm0, xmm1
  1249. phaddw xmm2, xmm3
  1250. psrlw xmm0, 7
  1251. psrlw xmm2, 7
  1252. packuswb xmm0, xmm2
  1253. paddb xmm0, xmm5
  1254. movdqu [edx], xmm0
  1255. lea edx, [edx + 16]
  1256. sub ecx, 16
  1257. jg convertloop
  1258. ret
  1259. }
  1260. }
  1261. __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
  1262. int src_stride_argb,
  1263. uint8_t* dst_u,
  1264. uint8_t* dst_v,
  1265. int width) {
  1266. __asm {
  1267. push esi
  1268. push edi
  1269. mov eax, [esp + 8 + 4] // src_argb
  1270. mov esi, [esp + 8 + 8] // src_stride_argb
  1271. mov edx, [esp + 8 + 12] // dst_u
  1272. mov edi, [esp + 8 + 16] // dst_v
  1273. mov ecx, [esp + 8 + 20] // width
  1274. movdqa xmm5, xmmword ptr kAddUV128
  1275. movdqa xmm6, xmmword ptr kARGBToV
  1276. movdqa xmm7, xmmword ptr kARGBToU
  1277. sub edi, edx // stride from u to v
  1278. convertloop:
  1279. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1280. movdqu xmm0, [eax]
  1281. movdqu xmm4, [eax + esi]
  1282. pavgb xmm0, xmm4
  1283. movdqu xmm1, [eax + 16]
  1284. movdqu xmm4, [eax + esi + 16]
  1285. pavgb xmm1, xmm4
  1286. movdqu xmm2, [eax + 32]
  1287. movdqu xmm4, [eax + esi + 32]
  1288. pavgb xmm2, xmm4
  1289. movdqu xmm3, [eax + 48]
  1290. movdqu xmm4, [eax + esi + 48]
  1291. pavgb xmm3, xmm4
  1292. lea eax, [eax + 64]
  1293. movdqa xmm4, xmm0
  1294. shufps xmm0, xmm1, 0x88
  1295. shufps xmm4, xmm1, 0xdd
  1296. pavgb xmm0, xmm4
  1297. movdqa xmm4, xmm2
  1298. shufps xmm2, xmm3, 0x88
  1299. shufps xmm4, xmm3, 0xdd
  1300. pavgb xmm2, xmm4
  1301. // step 2 - convert to U and V
  1302. // from here down is very similar to Y code except
1303. // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1304. movdqa xmm1, xmm0
  1305. movdqa xmm3, xmm2
  1306. pmaddubsw xmm0, xmm7 // U
  1307. pmaddubsw xmm2, xmm7
  1308. pmaddubsw xmm1, xmm6 // V
  1309. pmaddubsw xmm3, xmm6
  1310. phaddw xmm0, xmm2
  1311. phaddw xmm1, xmm3
  1312. psraw xmm0, 8
  1313. psraw xmm1, 8
  1314. packsswb xmm0, xmm1
  1315. paddb xmm0, xmm5 // -> unsigned
  1316. // step 3 - store 8 U and 8 V values
  1317. movlps qword ptr [edx], xmm0 // U
  1318. movhps qword ptr [edx + edi], xmm0 // V
  1319. lea edx, [edx + 8]
  1320. sub ecx, 16
  1321. jg convertloop
  1322. pop edi
  1323. pop esi
  1324. ret
  1325. }
  1326. }
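// A scalar sketch of the 2x2 subsample + UV conversion above. The signed
// weights really live in kARGBToU / kARGBToV; the 112/-74/-38 and
// -18/-94/112 values below are the usual BT.601 ones and are illustrative.
// The single rounded 2x2 average approximates the two pavgb passes (each a
// rounding average). Assumes an even width, as the SIMD path does.
static void ARGBToUVRow_Sketch(const uint8_t* src_argb,
                               int src_stride_argb,
                               uint8_t* dst_u,
                               uint8_t* dst_v,
                               int width) {
  const uint8_t* row1 = src_argb + src_stride_argb;
  for (int x = 0; x < width; x += 2) {
    int b = (src_argb[0] + src_argb[4] + row1[0] + row1[4] + 2) >> 2;
    int g = (src_argb[1] + src_argb[5] + row1[1] + row1[5] + 2) >> 2;
    int r = (src_argb[2] + src_argb[6] + row1[2] + row1[6] + 2) >> 2;
    // pmaddubsw/phaddw form the weighted sums, psraw 8 scales them
    // (arithmetic shift), and paddb kAddUV128 re-biases into 0..255.
    *dst_u++ = (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    *dst_v++ = (uint8_t)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
    src_argb += 8;
    row1 += 8;
  }
}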
  1327. __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
  1328. int src_stride_argb,
  1329. uint8_t* dst_u,
  1330. uint8_t* dst_v,
  1331. int width) {
  1332. __asm {
  1333. push esi
  1334. push edi
  1335. mov eax, [esp + 8 + 4] // src_argb
  1336. mov esi, [esp + 8 + 8] // src_stride_argb
  1337. mov edx, [esp + 8 + 12] // dst_u
  1338. mov edi, [esp + 8 + 16] // dst_v
  1339. mov ecx, [esp + 8 + 20] // width
  1340. movdqa xmm5, xmmword ptr kAddUVJ128
  1341. movdqa xmm6, xmmword ptr kARGBToVJ
  1342. movdqa xmm7, xmmword ptr kARGBToUJ
  1343. sub edi, edx // stride from u to v
  1344. convertloop:
  1345. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1346. movdqu xmm0, [eax]
  1347. movdqu xmm4, [eax + esi]
  1348. pavgb xmm0, xmm4
  1349. movdqu xmm1, [eax + 16]
  1350. movdqu xmm4, [eax + esi + 16]
  1351. pavgb xmm1, xmm4
  1352. movdqu xmm2, [eax + 32]
  1353. movdqu xmm4, [eax + esi + 32]
  1354. pavgb xmm2, xmm4
  1355. movdqu xmm3, [eax + 48]
  1356. movdqu xmm4, [eax + esi + 48]
  1357. pavgb xmm3, xmm4
  1358. lea eax, [eax + 64]
  1359. movdqa xmm4, xmm0
  1360. shufps xmm0, xmm1, 0x88
  1361. shufps xmm4, xmm1, 0xdd
  1362. pavgb xmm0, xmm4
  1363. movdqa xmm4, xmm2
  1364. shufps xmm2, xmm3, 0x88
  1365. shufps xmm4, xmm3, 0xdd
  1366. pavgb xmm2, xmm4
  1367. // step 2 - convert to U and V
  1368. // from here down is very similar to Y code except
1369. // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1370. movdqa xmm1, xmm0
  1371. movdqa xmm3, xmm2
  1372. pmaddubsw xmm0, xmm7 // U
  1373. pmaddubsw xmm2, xmm7
  1374. pmaddubsw xmm1, xmm6 // V
  1375. pmaddubsw xmm3, xmm6
  1376. phaddw xmm0, xmm2
  1377. phaddw xmm1, xmm3
  1378. paddw xmm0, xmm5 // +.5 rounding -> unsigned
  1379. paddw xmm1, xmm5
  1380. psraw xmm0, 8
  1381. psraw xmm1, 8
  1382. packsswb xmm0, xmm1
  1383. // step 3 - store 8 U and 8 V values
  1384. movlps qword ptr [edx], xmm0 // U
  1385. movhps qword ptr [edx + edi], xmm0 // V
  1386. lea edx, [edx + 8]
  1387. sub ecx, 16
  1388. jg convertloop
  1389. pop edi
  1390. pop esi
  1391. ret
  1392. }
  1393. }
  1394. #ifdef HAS_ARGBTOUVROW_AVX2
  1395. __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
  1396. int src_stride_argb,
  1397. uint8_t* dst_u,
  1398. uint8_t* dst_v,
  1399. int width) {
  1400. __asm {
  1401. push esi
  1402. push edi
  1403. mov eax, [esp + 8 + 4] // src_argb
  1404. mov esi, [esp + 8 + 8] // src_stride_argb
  1405. mov edx, [esp + 8 + 12] // dst_u
  1406. mov edi, [esp + 8 + 16] // dst_v
  1407. mov ecx, [esp + 8 + 20] // width
  1408. vbroadcastf128 ymm5, xmmword ptr kAddUV128
  1409. vbroadcastf128 ymm6, xmmword ptr kARGBToV
  1410. vbroadcastf128 ymm7, xmmword ptr kARGBToU
  1411. sub edi, edx // stride from u to v
  1412. convertloop:
  1413. /* step 1 - subsample 32x2 argb pixels to 16x1 */
  1414. vmovdqu ymm0, [eax]
  1415. vmovdqu ymm1, [eax + 32]
  1416. vmovdqu ymm2, [eax + 64]
  1417. vmovdqu ymm3, [eax + 96]
  1418. vpavgb ymm0, ymm0, [eax + esi]
  1419. vpavgb ymm1, ymm1, [eax + esi + 32]
  1420. vpavgb ymm2, ymm2, [eax + esi + 64]
  1421. vpavgb ymm3, ymm3, [eax + esi + 96]
  1422. lea eax, [eax + 128]
  1423. vshufps ymm4, ymm0, ymm1, 0x88
  1424. vshufps ymm0, ymm0, ymm1, 0xdd
  1425. vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
  1426. vshufps ymm4, ymm2, ymm3, 0x88
  1427. vshufps ymm2, ymm2, ymm3, 0xdd
  1428. vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
  1429. // step 2 - convert to U and V
  1430. // from here down is very similar to Y code except
1431. // instead of 32 different pixels, it's 16 pixels of U and 16 of V
  1432. vpmaddubsw ymm1, ymm0, ymm7 // U
  1433. vpmaddubsw ymm3, ymm2, ymm7
  1434. vpmaddubsw ymm0, ymm0, ymm6 // V
  1435. vpmaddubsw ymm2, ymm2, ymm6
  1436. vphaddw ymm1, ymm1, ymm3 // mutates
  1437. vphaddw ymm0, ymm0, ymm2
  1438. vpsraw ymm1, ymm1, 8
  1439. vpsraw ymm0, ymm0, 8
  1440. vpacksswb ymm0, ymm1, ymm0 // mutates
  1441. vpermq ymm0, ymm0, 0xd8 // For vpacksswb
  1442. vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
  1443. vpaddb ymm0, ymm0, ymm5 // -> unsigned
  1444. // step 3 - store 16 U and 16 V values
  1445. vextractf128 [edx], ymm0, 0 // U
  1446. vextractf128 [edx + edi], ymm0, 1 // V
  1447. lea edx, [edx + 16]
  1448. sub ecx, 32
  1449. jg convertloop
  1450. pop edi
  1451. pop esi
  1452. vzeroupper
  1453. ret
  1454. }
  1455. }
  1456. #endif // HAS_ARGBTOUVROW_AVX2
  1457. #ifdef HAS_ARGBTOUVJROW_AVX2
  1458. __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
  1459. int src_stride_argb,
  1460. uint8_t* dst_u,
  1461. uint8_t* dst_v,
  1462. int width) {
  1463. __asm {
  1464. push esi
  1465. push edi
  1466. mov eax, [esp + 8 + 4] // src_argb
  1467. mov esi, [esp + 8 + 8] // src_stride_argb
  1468. mov edx, [esp + 8 + 12] // dst_u
  1469. mov edi, [esp + 8 + 16] // dst_v
  1470. mov ecx, [esp + 8 + 20] // width
  1471. vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
  1472. vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
  1473. vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
  1474. sub edi, edx // stride from u to v
  1475. convertloop:
  1476. /* step 1 - subsample 32x2 argb pixels to 16x1 */
  1477. vmovdqu ymm0, [eax]
  1478. vmovdqu ymm1, [eax + 32]
  1479. vmovdqu ymm2, [eax + 64]
  1480. vmovdqu ymm3, [eax + 96]
  1481. vpavgb ymm0, ymm0, [eax + esi]
  1482. vpavgb ymm1, ymm1, [eax + esi + 32]
  1483. vpavgb ymm2, ymm2, [eax + esi + 64]
  1484. vpavgb ymm3, ymm3, [eax + esi + 96]
  1485. lea eax, [eax + 128]
  1486. vshufps ymm4, ymm0, ymm1, 0x88
  1487. vshufps ymm0, ymm0, ymm1, 0xdd
  1488. vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
  1489. vshufps ymm4, ymm2, ymm3, 0x88
  1490. vshufps ymm2, ymm2, ymm3, 0xdd
  1491. vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
  1492. // step 2 - convert to U and V
  1493. // from here down is very similar to Y code except
1494. // instead of 32 different pixels, it's 16 pixels of U and 16 of V
  1495. vpmaddubsw ymm1, ymm0, ymm7 // U
  1496. vpmaddubsw ymm3, ymm2, ymm7
  1497. vpmaddubsw ymm0, ymm0, ymm6 // V
  1498. vpmaddubsw ymm2, ymm2, ymm6
  1499. vphaddw ymm1, ymm1, ymm3 // mutates
  1500. vphaddw ymm0, ymm0, ymm2
  1501. vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
  1502. vpaddw ymm0, ymm0, ymm5
  1503. vpsraw ymm1, ymm1, 8
  1504. vpsraw ymm0, ymm0, 8
  1505. vpacksswb ymm0, ymm1, ymm0 // mutates
  1506. vpermq ymm0, ymm0, 0xd8 // For vpacksswb
  1507. vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
  1508. // step 3 - store 16 U and 16 V values
  1509. vextractf128 [edx], ymm0, 0 // U
  1510. vextractf128 [edx + edi], ymm0, 1 // V
  1511. lea edx, [edx + 16]
  1512. sub ecx, 32
  1513. jg convertloop
  1514. pop edi
  1515. pop esi
  1516. vzeroupper
  1517. ret
  1518. }
  1519. }
  1520. #endif // HAS_ARGBTOUVJROW_AVX2
  1521. __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
  1522. uint8_t* dst_u,
  1523. uint8_t* dst_v,
  1524. int width) {
  1525. __asm {
  1526. push edi
  1527. mov eax, [esp + 4 + 4] // src_argb
  1528. mov edx, [esp + 4 + 8] // dst_u
  1529. mov edi, [esp + 4 + 12] // dst_v
  1530. mov ecx, [esp + 4 + 16] // width
  1531. movdqa xmm5, xmmword ptr kAddUV128
  1532. movdqa xmm6, xmmword ptr kARGBToV
  1533. movdqa xmm7, xmmword ptr kARGBToU
  1534. sub edi, edx // stride from u to v
  1535. convertloop:
  1536. /* convert to U and V */
  1537. movdqu xmm0, [eax] // U
  1538. movdqu xmm1, [eax + 16]
  1539. movdqu xmm2, [eax + 32]
  1540. movdqu xmm3, [eax + 48]
  1541. pmaddubsw xmm0, xmm7
  1542. pmaddubsw xmm1, xmm7
  1543. pmaddubsw xmm2, xmm7
  1544. pmaddubsw xmm3, xmm7
  1545. phaddw xmm0, xmm1
  1546. phaddw xmm2, xmm3
  1547. psraw xmm0, 8
  1548. psraw xmm2, 8
  1549. packsswb xmm0, xmm2
  1550. paddb xmm0, xmm5
  1551. movdqu [edx], xmm0
  1552. movdqu xmm0, [eax] // V
  1553. movdqu xmm1, [eax + 16]
  1554. movdqu xmm2, [eax + 32]
  1555. movdqu xmm3, [eax + 48]
  1556. pmaddubsw xmm0, xmm6
  1557. pmaddubsw xmm1, xmm6
  1558. pmaddubsw xmm2, xmm6
  1559. pmaddubsw xmm3, xmm6
  1560. phaddw xmm0, xmm1
  1561. phaddw xmm2, xmm3
  1562. psraw xmm0, 8
  1563. psraw xmm2, 8
  1564. packsswb xmm0, xmm2
  1565. paddb xmm0, xmm5
  1566. lea eax, [eax + 64]
  1567. movdqu [edx + edi], xmm0
  1568. lea edx, [edx + 16]
  1569. sub ecx, 16
  1570. jg convertloop
  1571. pop edi
  1572. ret
  1573. }
  1574. }
  1575. __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
  1576. int src_stride_argb,
  1577. uint8_t* dst_u,
  1578. uint8_t* dst_v,
  1579. int width) {
  1580. __asm {
  1581. push esi
  1582. push edi
  1583. mov eax, [esp + 8 + 4] // src_argb
  1584. mov esi, [esp + 8 + 8] // src_stride_argb
  1585. mov edx, [esp + 8 + 12] // dst_u
  1586. mov edi, [esp + 8 + 16] // dst_v
  1587. mov ecx, [esp + 8 + 20] // width
  1588. movdqa xmm5, xmmword ptr kAddUV128
  1589. movdqa xmm6, xmmword ptr kBGRAToV
  1590. movdqa xmm7, xmmword ptr kBGRAToU
  1591. sub edi, edx // stride from u to v
  1592. convertloop:
  1593. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1594. movdqu xmm0, [eax]
  1595. movdqu xmm4, [eax + esi]
  1596. pavgb xmm0, xmm4
  1597. movdqu xmm1, [eax + 16]
  1598. movdqu xmm4, [eax + esi + 16]
  1599. pavgb xmm1, xmm4
  1600. movdqu xmm2, [eax + 32]
  1601. movdqu xmm4, [eax + esi + 32]
  1602. pavgb xmm2, xmm4
  1603. movdqu xmm3, [eax + 48]
  1604. movdqu xmm4, [eax + esi + 48]
  1605. pavgb xmm3, xmm4
  1606. lea eax, [eax + 64]
  1607. movdqa xmm4, xmm0
  1608. shufps xmm0, xmm1, 0x88
  1609. shufps xmm4, xmm1, 0xdd
  1610. pavgb xmm0, xmm4
  1611. movdqa xmm4, xmm2
  1612. shufps xmm2, xmm3, 0x88
  1613. shufps xmm4, xmm3, 0xdd
  1614. pavgb xmm2, xmm4
  1615. // step 2 - convert to U and V
  1616. // from here down is very similar to Y code except
1617. // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1618. movdqa xmm1, xmm0
  1619. movdqa xmm3, xmm2
  1620. pmaddubsw xmm0, xmm7 // U
  1621. pmaddubsw xmm2, xmm7
  1622. pmaddubsw xmm1, xmm6 // V
  1623. pmaddubsw xmm3, xmm6
  1624. phaddw xmm0, xmm2
  1625. phaddw xmm1, xmm3
  1626. psraw xmm0, 8
  1627. psraw xmm1, 8
  1628. packsswb xmm0, xmm1
  1629. paddb xmm0, xmm5 // -> unsigned
  1630. // step 3 - store 8 U and 8 V values
  1631. movlps qword ptr [edx], xmm0 // U
  1632. movhps qword ptr [edx + edi], xmm0 // V
  1633. lea edx, [edx + 8]
  1634. sub ecx, 16
  1635. jg convertloop
  1636. pop edi
  1637. pop esi
  1638. ret
  1639. }
  1640. }
  1641. __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
  1642. int src_stride_argb,
  1643. uint8_t* dst_u,
  1644. uint8_t* dst_v,
  1645. int width) {
  1646. __asm {
  1647. push esi
  1648. push edi
  1649. mov eax, [esp + 8 + 4] // src_argb
  1650. mov esi, [esp + 8 + 8] // src_stride_argb
  1651. mov edx, [esp + 8 + 12] // dst_u
  1652. mov edi, [esp + 8 + 16] // dst_v
  1653. mov ecx, [esp + 8 + 20] // width
  1654. movdqa xmm5, xmmword ptr kAddUV128
  1655. movdqa xmm6, xmmword ptr kABGRToV
  1656. movdqa xmm7, xmmword ptr kABGRToU
  1657. sub edi, edx // stride from u to v
  1658. convertloop:
  1659. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1660. movdqu xmm0, [eax]
  1661. movdqu xmm4, [eax + esi]
  1662. pavgb xmm0, xmm4
  1663. movdqu xmm1, [eax + 16]
  1664. movdqu xmm4, [eax + esi + 16]
  1665. pavgb xmm1, xmm4
  1666. movdqu xmm2, [eax + 32]
  1667. movdqu xmm4, [eax + esi + 32]
  1668. pavgb xmm2, xmm4
  1669. movdqu xmm3, [eax + 48]
  1670. movdqu xmm4, [eax + esi + 48]
  1671. pavgb xmm3, xmm4
  1672. lea eax, [eax + 64]
  1673. movdqa xmm4, xmm0
  1674. shufps xmm0, xmm1, 0x88
  1675. shufps xmm4, xmm1, 0xdd
  1676. pavgb xmm0, xmm4
  1677. movdqa xmm4, xmm2
  1678. shufps xmm2, xmm3, 0x88
  1679. shufps xmm4, xmm3, 0xdd
  1680. pavgb xmm2, xmm4
  1681. // step 2 - convert to U and V
  1682. // from here down is very similar to Y code except
1683. // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1684. movdqa xmm1, xmm0
  1685. movdqa xmm3, xmm2
  1686. pmaddubsw xmm0, xmm7 // U
  1687. pmaddubsw xmm2, xmm7
  1688. pmaddubsw xmm1, xmm6 // V
  1689. pmaddubsw xmm3, xmm6
  1690. phaddw xmm0, xmm2
  1691. phaddw xmm1, xmm3
  1692. psraw xmm0, 8
  1693. psraw xmm1, 8
  1694. packsswb xmm0, xmm1
  1695. paddb xmm0, xmm5 // -> unsigned
  1696. // step 3 - store 8 U and 8 V values
  1697. movlps qword ptr [edx], xmm0 // U
  1698. movhps qword ptr [edx + edi], xmm0 // V
  1699. lea edx, [edx + 8]
  1700. sub ecx, 16
  1701. jg convertloop
  1702. pop edi
  1703. pop esi
  1704. ret
  1705. }
  1706. }
  1707. __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
  1708. int src_stride_argb,
  1709. uint8_t* dst_u,
  1710. uint8_t* dst_v,
  1711. int width) {
  1712. __asm {
  1713. push esi
  1714. push edi
  1715. mov eax, [esp + 8 + 4] // src_argb
  1716. mov esi, [esp + 8 + 8] // src_stride_argb
  1717. mov edx, [esp + 8 + 12] // dst_u
  1718. mov edi, [esp + 8 + 16] // dst_v
  1719. mov ecx, [esp + 8 + 20] // width
  1720. movdqa xmm5, xmmword ptr kAddUV128
  1721. movdqa xmm6, xmmword ptr kRGBAToV
  1722. movdqa xmm7, xmmword ptr kRGBAToU
  1723. sub edi, edx // stride from u to v
  1724. convertloop:
  1725. /* step 1 - subsample 16x2 argb pixels to 8x1 */
  1726. movdqu xmm0, [eax]
  1727. movdqu xmm4, [eax + esi]
  1728. pavgb xmm0, xmm4
  1729. movdqu xmm1, [eax + 16]
  1730. movdqu xmm4, [eax + esi + 16]
  1731. pavgb xmm1, xmm4
  1732. movdqu xmm2, [eax + 32]
  1733. movdqu xmm4, [eax + esi + 32]
  1734. pavgb xmm2, xmm4
  1735. movdqu xmm3, [eax + 48]
  1736. movdqu xmm4, [eax + esi + 48]
  1737. pavgb xmm3, xmm4
  1738. lea eax, [eax + 64]
  1739. movdqa xmm4, xmm0
  1740. shufps xmm0, xmm1, 0x88
  1741. shufps xmm4, xmm1, 0xdd
  1742. pavgb xmm0, xmm4
  1743. movdqa xmm4, xmm2
  1744. shufps xmm2, xmm3, 0x88
  1745. shufps xmm4, xmm3, 0xdd
  1746. pavgb xmm2, xmm4
  1747. // step 2 - convert to U and V
  1748. // from here down is very similar to Y code except
1749. // instead of 16 different pixels, it's 8 pixels of U and 8 of V
  1750. movdqa xmm1, xmm0
  1751. movdqa xmm3, xmm2
  1752. pmaddubsw xmm0, xmm7 // U
  1753. pmaddubsw xmm2, xmm7
  1754. pmaddubsw xmm1, xmm6 // V
  1755. pmaddubsw xmm3, xmm6
  1756. phaddw xmm0, xmm2
  1757. phaddw xmm1, xmm3
  1758. psraw xmm0, 8
  1759. psraw xmm1, 8
  1760. packsswb xmm0, xmm1
  1761. paddb xmm0, xmm5 // -> unsigned
  1762. // step 3 - store 8 U and 8 V values
  1763. movlps qword ptr [edx], xmm0 // U
  1764. movhps qword ptr [edx + edi], xmm0 // V
  1765. lea edx, [edx + 8]
  1766. sub ecx, 16
  1767. jg convertloop
  1768. pop edi
  1769. pop esi
  1770. ret
  1771. }
  1772. }
  1773. #endif // HAS_ARGBTOYROW_SSSE3
  1774. // Read 16 UV from 444
  1775. #define READYUV444_AVX2 \
  1776. __asm { \
  1777. __asm vmovdqu xmm0, [esi] /* U */ \
  1778. __asm vmovdqu xmm1, [esi + edi] /* V */ \
  1779. __asm lea esi, [esi + 16] \
  1780. __asm vpermq ymm0, ymm0, 0xd8 \
  1781. __asm vpermq ymm1, ymm1, 0xd8 \
  1782. __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
  1783. __asm vmovdqu xmm4, [eax] /* Y */ \
  1784. __asm vpermq ymm4, ymm4, 0xd8 \
  1785. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1786. __asm lea eax, [eax + 16]}
  1787. // Read 8 UV from 422, upsample to 16 UV.
  1788. #define READYUV422_AVX2 \
  1789. __asm { \
  1790. __asm vmovq xmm0, qword ptr [esi] /* U */ \
  1791. __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
  1792. __asm lea esi, [esi + 8] \
  1793. __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
  1794. __asm vpermq ymm0, ymm0, 0xd8 \
  1795. __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
  1796. __asm vmovdqu xmm4, [eax] /* Y */ \
  1797. __asm vpermq ymm4, ymm4, 0xd8 \
  1798. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1799. __asm lea eax, [eax + 16]}
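// What READYUV422_AVX2 hands to YUVTORGB_AVX2, in scalar terms: two
// horizontally adjacent pixels share one U/V pair (the vpunpcklwd upsample),
// and each Y byte is widened to 16 bits by duplication (y * 0x0101, the
// vpunpcklbw ymm4, ymm4) for the later vpmulhuw. A minimal sketch,
// illustrative only.
static void ReadYUV422_Sketch(const uint8_t* y_buf,
                              const uint8_t* u_buf,
                              const uint8_t* v_buf,
                              int x,
                              uint16_t* y16,
                              uint8_t* u,
                              uint8_t* v) {
  *u = u_buf[x / 2];  // 4:2:2 - one chroma sample per two pixels
  *v = v_buf[x / 2];
  *y16 = (uint16_t)(y_buf[x] * 0x0101);
}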
  1800. // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
  1801. #define READYUVA422_AVX2 \
  1802. __asm { \
  1803. __asm vmovq xmm0, qword ptr [esi] /* U */ \
  1804. __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
  1805. __asm lea esi, [esi + 8] \
  1806. __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
  1807. __asm vpermq ymm0, ymm0, 0xd8 \
  1808. __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
  1809. __asm vmovdqu xmm4, [eax] /* Y */ \
  1810. __asm vpermq ymm4, ymm4, 0xd8 \
  1811. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1812. __asm lea eax, [eax + 16] \
  1813. __asm vmovdqu xmm5, [ebp] /* A */ \
  1814. __asm vpermq ymm5, ymm5, 0xd8 \
  1815. __asm lea ebp, [ebp + 16]}
  1816. // Read 8 UV from NV12, upsample to 16 UV.
  1817. #define READNV12_AVX2 \
  1818. __asm { \
  1819. __asm vmovdqu xmm0, [esi] /* UV */ \
  1820. __asm lea esi, [esi + 16] \
  1821. __asm vpermq ymm0, ymm0, 0xd8 \
  1822. __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
  1823. __asm vmovdqu xmm4, [eax] /* Y */ \
  1824. __asm vpermq ymm4, ymm4, 0xd8 \
  1825. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1826. __asm lea eax, [eax + 16]}
1827. // Read 8 VU from NV21, upsample to 16 UV.
  1828. #define READNV21_AVX2 \
  1829. __asm { \
  1830. __asm vmovdqu xmm0, [esi] /* UV */ \
  1831. __asm lea esi, [esi + 16] \
  1832. __asm vpermq ymm0, ymm0, 0xd8 \
  1833. __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
  1834. __asm vmovdqu xmm4, [eax] /* Y */ \
  1835. __asm vpermq ymm4, ymm4, 0xd8 \
  1836. __asm vpunpcklbw ymm4, ymm4, ymm4 \
  1837. __asm lea eax, [eax + 16]}
  1838. // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
  1839. #define READYUY2_AVX2 \
  1840. __asm { \
  1841. __asm vmovdqu ymm4, [eax] /* YUY2 */ \
  1842. __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
  1843. __asm vmovdqu ymm0, [eax] /* UV */ \
  1844. __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
  1845. __asm lea eax, [eax + 32]}
  1846. // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
  1847. #define READUYVY_AVX2 \
  1848. __asm { \
  1849. __asm vmovdqu ymm4, [eax] /* UYVY */ \
  1850. __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
  1851. __asm vmovdqu ymm0, [eax] /* UV */ \
  1852. __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
  1853. __asm lea eax, [eax + 32]}
  1854. // Convert 16 pixels: 16 UV and 16 Y.
  1855. #define YUVTORGB_AVX2(YuvConstants) \
  1856. __asm { \
  1857. __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
  1858. __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
  1859. __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
  1860. __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
  1861. __asm vpsubw ymm2, ymm3, ymm2 \
  1862. __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
  1863. __asm vpsubw ymm1, ymm3, ymm1 \
  1864. __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
  1865. __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
  1866. __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
  1867. __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
  1868. __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
  1869. __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
  1870. __asm vpsraw ymm0, ymm0, 6 \
  1871. __asm vpsraw ymm1, ymm1, 6 \
  1872. __asm vpsraw ymm2, ymm2, 6 \
  1873. __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
  1874. __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
  1875. __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
  1876. }
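// A scalar view of what YUVTORGB_AVX2 (and the SSSE3 YUVTORGB below) computes
// per channel:
//   value = clamp(((bias - uv_term) + ((y * 0x0101 * ygain) >> 16)) >> 6)
// where uv_term is the signed dot product of the interleaved U,V bytes with a
// KUVTOB/KUVTOG/KUVTOR row, and bias/ygain come from the same YuvConstants
// block. A minimal sketch under those assumptions, not the library's C path.
static uint8_t YuvChannel_Sketch(uint16_t y16,    // y * 0x0101, see READ* above
                                 uint8_t u,
                                 uint8_t v,
                                 int16_t coef_u,  // from KUVTO*
                                 int16_t coef_v,
                                 int16_t bias,    // from KUVBIAS*
                                 uint16_t ygain) {  // from KYTORGB
  int uv_term = u * coef_u + v * coef_v;              // vpmaddubsw
  int y_term = (int)(((uint32_t)y16 * ygain) >> 16);  // vpmulhuw
  int value = ((bias - uv_term) + y_term) >> 6;       // vpsubw/vpaddsw/vpsraw
  if (value < 0) value = 0;                           // vpackuswb saturates
  if (value > 255) value = 255;
  return (uint8_t)value;
}
// STOREARGB_AVX2 below then interleaves the three packed channels with the
// alpha register to produce B,G,R,A bytes in memory.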
  1877. // Store 16 ARGB values.
  1878. #define STOREARGB_AVX2 \
  1879. __asm { \
  1880. __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
  1881. __asm vpermq ymm0, ymm0, 0xd8 \
  1882. __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
  1883. __asm vpermq ymm2, ymm2, 0xd8 \
  1884. __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
  1885. __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
  1886. __asm vmovdqu 0[edx], ymm1 \
  1887. __asm vmovdqu 32[edx], ymm0 \
  1888. __asm lea edx, [edx + 64]}
  1889. // Store 16 RGBA values.
  1890. #define STORERGBA_AVX2 \
  1891. __asm { \
  1892. __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
  1893. __asm vpermq ymm1, ymm1, 0xd8 \
  1894. __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
  1895. __asm vpermq ymm2, ymm2, 0xd8 \
  1896. __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
  1897. __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
  1898. __asm vmovdqu [edx], ymm0 \
  1899. __asm vmovdqu [edx + 32], ymm1 \
  1900. __asm lea edx, [edx + 64]}
  1901. #ifdef HAS_I422TOARGBROW_AVX2
  1902. // 16 pixels
  1903. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  1904. __declspec(naked) void I422ToARGBRow_AVX2(
  1905. const uint8_t* y_buf,
  1906. const uint8_t* u_buf,
  1907. const uint8_t* v_buf,
  1908. uint8_t* dst_argb,
  1909. const struct YuvConstants* yuvconstants,
  1910. int width) {
  1911. __asm {
  1912. push esi
  1913. push edi
  1914. push ebx
  1915. mov eax, [esp + 12 + 4] // Y
  1916. mov esi, [esp + 12 + 8] // U
  1917. mov edi, [esp + 12 + 12] // V
  1918. mov edx, [esp + 12 + 16] // argb
  1919. mov ebx, [esp + 12 + 20] // yuvconstants
  1920. mov ecx, [esp + 12 + 24] // width
  1921. sub edi, esi
  1922. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  1923. convertloop:
  1924. READYUV422_AVX2
  1925. YUVTORGB_AVX2(ebx)
  1926. STOREARGB_AVX2
  1927. sub ecx, 16
  1928. jg convertloop
  1929. pop ebx
  1930. pop edi
  1931. pop esi
  1932. vzeroupper
  1933. ret
  1934. }
  1935. }
  1936. #endif // HAS_I422TOARGBROW_AVX2
  1937. #ifdef HAS_I422ALPHATOARGBROW_AVX2
  1938. // 16 pixels
  1939. // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
  1940. __declspec(naked) void I422AlphaToARGBRow_AVX2(
  1941. const uint8_t* y_buf,
  1942. const uint8_t* u_buf,
  1943. const uint8_t* v_buf,
  1944. const uint8_t* a_buf,
  1945. uint8_t* dst_argb,
  1946. const struct YuvConstants* yuvconstants,
  1947. int width) {
  1948. __asm {
  1949. push esi
  1950. push edi
  1951. push ebx
  1952. push ebp
  1953. mov eax, [esp + 16 + 4] // Y
  1954. mov esi, [esp + 16 + 8] // U
  1955. mov edi, [esp + 16 + 12] // V
  1956. mov ebp, [esp + 16 + 16] // A
  1957. mov edx, [esp + 16 + 20] // argb
  1958. mov ebx, [esp + 16 + 24] // yuvconstants
  1959. mov ecx, [esp + 16 + 28] // width
  1960. sub edi, esi
  1961. convertloop:
  1962. READYUVA422_AVX2
  1963. YUVTORGB_AVX2(ebx)
  1964. STOREARGB_AVX2
  1965. sub ecx, 16
  1966. jg convertloop
  1967. pop ebp
  1968. pop ebx
  1969. pop edi
  1970. pop esi
  1971. vzeroupper
  1972. ret
  1973. }
  1974. }
  1975. #endif // HAS_I422ALPHATOARGBROW_AVX2
  1976. #ifdef HAS_I444TOARGBROW_AVX2
  1977. // 16 pixels
  1978. // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
  1979. __declspec(naked) void I444ToARGBRow_AVX2(
  1980. const uint8_t* y_buf,
  1981. const uint8_t* u_buf,
  1982. const uint8_t* v_buf,
  1983. uint8_t* dst_argb,
  1984. const struct YuvConstants* yuvconstants,
  1985. int width) {
  1986. __asm {
  1987. push esi
  1988. push edi
  1989. push ebx
  1990. mov eax, [esp + 12 + 4] // Y
  1991. mov esi, [esp + 12 + 8] // U
  1992. mov edi, [esp + 12 + 12] // V
  1993. mov edx, [esp + 12 + 16] // argb
  1994. mov ebx, [esp + 12 + 20] // yuvconstants
  1995. mov ecx, [esp + 12 + 24] // width
  1996. sub edi, esi
  1997. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  1998. convertloop:
  1999. READYUV444_AVX2
  2000. YUVTORGB_AVX2(ebx)
  2001. STOREARGB_AVX2
  2002. sub ecx, 16
  2003. jg convertloop
  2004. pop ebx
  2005. pop edi
  2006. pop esi
  2007. vzeroupper
  2008. ret
  2009. }
  2010. }
  2011. #endif // HAS_I444TOARGBROW_AVX2
  2012. #ifdef HAS_NV12TOARGBROW_AVX2
  2013. // 16 pixels.
  2014. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2015. __declspec(naked) void NV12ToARGBRow_AVX2(
  2016. const uint8_t* y_buf,
  2017. const uint8_t* uv_buf,
  2018. uint8_t* dst_argb,
  2019. const struct YuvConstants* yuvconstants,
  2020. int width) {
  2021. __asm {
  2022. push esi
  2023. push ebx
  2024. mov eax, [esp + 8 + 4] // Y
  2025. mov esi, [esp + 8 + 8] // UV
  2026. mov edx, [esp + 8 + 12] // argb
  2027. mov ebx, [esp + 8 + 16] // yuvconstants
  2028. mov ecx, [esp + 8 + 20] // width
  2029. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2030. convertloop:
  2031. READNV12_AVX2
  2032. YUVTORGB_AVX2(ebx)
  2033. STOREARGB_AVX2
  2034. sub ecx, 16
  2035. jg convertloop
  2036. pop ebx
  2037. pop esi
  2038. vzeroupper
  2039. ret
  2040. }
  2041. }
  2042. #endif // HAS_NV12TOARGBROW_AVX2
  2043. #ifdef HAS_NV21TOARGBROW_AVX2
  2044. // 16 pixels.
  2045. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2046. __declspec(naked) void NV21ToARGBRow_AVX2(
  2047. const uint8_t* y_buf,
  2048. const uint8_t* vu_buf,
  2049. uint8_t* dst_argb,
  2050. const struct YuvConstants* yuvconstants,
  2051. int width) {
  2052. __asm {
  2053. push esi
  2054. push ebx
  2055. mov eax, [esp + 8 + 4] // Y
  2056. mov esi, [esp + 8 + 8] // VU
  2057. mov edx, [esp + 8 + 12] // argb
  2058. mov ebx, [esp + 8 + 16] // yuvconstants
  2059. mov ecx, [esp + 8 + 20] // width
  2060. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2061. convertloop:
  2062. READNV21_AVX2
  2063. YUVTORGB_AVX2(ebx)
  2064. STOREARGB_AVX2
  2065. sub ecx, 16
  2066. jg convertloop
  2067. pop ebx
  2068. pop esi
  2069. vzeroupper
  2070. ret
  2071. }
  2072. }
  2073. #endif // HAS_NV21TOARGBROW_AVX2
  2074. #ifdef HAS_YUY2TOARGBROW_AVX2
  2075. // 16 pixels.
  2076. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2077. __declspec(naked) void YUY2ToARGBRow_AVX2(
  2078. const uint8_t* src_yuy2,
  2079. uint8_t* dst_argb,
  2080. const struct YuvConstants* yuvconstants,
  2081. int width) {
  2082. __asm {
  2083. push ebx
  2084. mov eax, [esp + 4 + 4] // yuy2
  2085. mov edx, [esp + 4 + 8] // argb
  2086. mov ebx, [esp + 4 + 12] // yuvconstants
  2087. mov ecx, [esp + 4 + 16] // width
  2088. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2089. convertloop:
  2090. READYUY2_AVX2
  2091. YUVTORGB_AVX2(ebx)
  2092. STOREARGB_AVX2
  2093. sub ecx, 16
  2094. jg convertloop
  2095. pop ebx
  2096. vzeroupper
  2097. ret
  2098. }
  2099. }
  2100. #endif // HAS_YUY2TOARGBROW_AVX2
  2101. #ifdef HAS_UYVYTOARGBROW_AVX2
  2102. // 16 pixels.
  2103. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2104. __declspec(naked) void UYVYToARGBRow_AVX2(
  2105. const uint8_t* src_uyvy,
  2106. uint8_t* dst_argb,
  2107. const struct YuvConstants* yuvconstants,
  2108. int width) {
  2109. __asm {
  2110. push ebx
  2111. mov eax, [esp + 4 + 4] // uyvy
  2112. mov edx, [esp + 4 + 8] // argb
  2113. mov ebx, [esp + 4 + 12] // yuvconstants
  2114. mov ecx, [esp + 4 + 16] // width
  2115. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2116. convertloop:
  2117. READUYVY_AVX2
  2118. YUVTORGB_AVX2(ebx)
  2119. STOREARGB_AVX2
  2120. sub ecx, 16
  2121. jg convertloop
  2122. pop ebx
  2123. vzeroupper
  2124. ret
  2125. }
  2126. }
  2127. #endif // HAS_UYVYTOARGBROW_AVX2
  2128. #ifdef HAS_I422TORGBAROW_AVX2
  2129. // 16 pixels
  2130. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
  2131. __declspec(naked) void I422ToRGBARow_AVX2(
  2132. const uint8_t* y_buf,
  2133. const uint8_t* u_buf,
  2134. const uint8_t* v_buf,
  2135. uint8_t* dst_argb,
  2136. const struct YuvConstants* yuvconstants,
  2137. int width) {
  2138. __asm {
  2139. push esi
  2140. push edi
  2141. push ebx
  2142. mov eax, [esp + 12 + 4] // Y
  2143. mov esi, [esp + 12 + 8] // U
  2144. mov edi, [esp + 12 + 12] // V
  2145. mov edx, [esp + 12 + 16] // abgr
  2146. mov ebx, [esp + 12 + 20] // yuvconstants
  2147. mov ecx, [esp + 12 + 24] // width
  2148. sub edi, esi
  2149. vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
  2150. convertloop:
  2151. READYUV422_AVX2
  2152. YUVTORGB_AVX2(ebx)
  2153. STORERGBA_AVX2
  2154. sub ecx, 16
  2155. jg convertloop
  2156. pop ebx
  2157. pop edi
  2158. pop esi
  2159. vzeroupper
  2160. ret
  2161. }
  2162. }
  2163. #endif // HAS_I422TORGBAROW_AVX2
  2164. #if defined(HAS_I422TOARGBROW_SSSE3)
  2165. // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
  2166. // Allows a conversion with half size scaling.
  2167. // Read 8 UV from 444.
  2168. #define READYUV444 \
  2169. __asm { \
  2170. __asm movq xmm0, qword ptr [esi] /* U */ \
  2171. __asm movq xmm1, qword ptr [esi + edi] /* V */ \
  2172. __asm lea esi, [esi + 8] \
  2173. __asm punpcklbw xmm0, xmm1 /* UV */ \
  2174. __asm movq xmm4, qword ptr [eax] \
  2175. __asm punpcklbw xmm4, xmm4 \
  2176. __asm lea eax, [eax + 8]}
  2177. // Read 4 UV from 422, upsample to 8 UV.
  2178. #define READYUV422 \
  2179. __asm { \
  2180. __asm movd xmm0, [esi] /* U */ \
  2181. __asm movd xmm1, [esi + edi] /* V */ \
  2182. __asm lea esi, [esi + 4] \
  2183. __asm punpcklbw xmm0, xmm1 /* UV */ \
  2184. __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  2185. __asm movq xmm4, qword ptr [eax] \
  2186. __asm punpcklbw xmm4, xmm4 \
  2187. __asm lea eax, [eax + 8]}
  2188. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
  2189. #define READYUVA422 \
  2190. __asm { \
  2191. __asm movd xmm0, [esi] /* U */ \
  2192. __asm movd xmm1, [esi + edi] /* V */ \
  2193. __asm lea esi, [esi + 4] \
  2194. __asm punpcklbw xmm0, xmm1 /* UV */ \
  2195. __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  2196. __asm movq xmm4, qword ptr [eax] /* Y */ \
  2197. __asm punpcklbw xmm4, xmm4 \
  2198. __asm lea eax, [eax + 8] \
  2199. __asm movq xmm5, qword ptr [ebp] /* A */ \
  2200. __asm lea ebp, [ebp + 8]}
  2201. // Read 4 UV from NV12, upsample to 8 UV.
  2202. #define READNV12 \
  2203. __asm { \
  2204. __asm movq xmm0, qword ptr [esi] /* UV */ \
  2205. __asm lea esi, [esi + 8] \
  2206. __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  2207. __asm movq xmm4, qword ptr [eax] \
  2208. __asm punpcklbw xmm4, xmm4 \
  2209. __asm lea eax, [eax + 8]}
  2210. // Read 4 VU from NV21, upsample to 8 UV.
  2211. #define READNV21 \
  2212. __asm { \
  2213. __asm movq xmm0, qword ptr [esi] /* UV */ \
  2214. __asm lea esi, [esi + 8] \
  2215. __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
  2216. __asm movq xmm4, qword ptr [eax] \
  2217. __asm punpcklbw xmm4, xmm4 \
  2218. __asm lea eax, [eax + 8]}
  2219. // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
  2220. #define READYUY2 \
  2221. __asm { \
  2222. __asm movdqu xmm4, [eax] /* YUY2 */ \
  2223. __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
  2224. __asm movdqu xmm0, [eax] /* UV */ \
  2225. __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
  2226. __asm lea eax, [eax + 16]}
  2227. // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
  2228. #define READUYVY \
  2229. __asm { \
  2230. __asm movdqu xmm4, [eax] /* UYVY */ \
  2231. __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
  2232. __asm movdqu xmm0, [eax] /* UV */ \
  2233. __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
  2234. __asm lea eax, [eax + 16]}
  2235. // Convert 8 pixels: 8 UV and 8 Y.
  2236. #define YUVTORGB(YuvConstants) \
  2237. __asm { \
  2238. __asm movdqa xmm1, xmm0 \
  2239. __asm movdqa xmm2, xmm0 \
  2240. __asm movdqa xmm3, xmm0 \
  2241. __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
  2242. __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
  2243. __asm psubw xmm0, xmm1 \
  2244. __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
  2245. __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
  2246. __asm psubw xmm1, xmm2 \
  2247. __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
  2248. __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
  2249. __asm psubw xmm2, xmm3 \
  2250. __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
  2251. __asm paddsw xmm0, xmm4 /* B += Y */ \
  2252. __asm paddsw xmm1, xmm4 /* G += Y */ \
  2253. __asm paddsw xmm2, xmm4 /* R += Y */ \
  2254. __asm psraw xmm0, 6 \
  2255. __asm psraw xmm1, 6 \
  2256. __asm psraw xmm2, 6 \
  2257. __asm packuswb xmm0, xmm0 /* B */ \
  2258. __asm packuswb xmm1, xmm1 /* G */ \
  2259. __asm packuswb xmm2, xmm2 /* R */ \
  2260. }
  2261. // Store 8 ARGB values.
  2262. #define STOREARGB \
  2263. __asm { \
  2264. __asm punpcklbw xmm0, xmm1 /* BG */ \
  2265. __asm punpcklbw xmm2, xmm5 /* RA */ \
  2266. __asm movdqa xmm1, xmm0 \
  2267. __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
  2268. __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
  2269. __asm movdqu 0[edx], xmm0 \
  2270. __asm movdqu 16[edx], xmm1 \
  2271. __asm lea edx, [edx + 32]}
  2272. // Store 8 BGRA values.
  2273. #define STOREBGRA \
  2274. __asm { \
  2275. __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
  2276. __asm punpcklbw xmm1, xmm0 /* GB */ \
  2277. __asm punpcklbw xmm5, xmm2 /* AR */ \
  2278. __asm movdqa xmm0, xmm5 \
  2279. __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
  2280. __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
  2281. __asm movdqu 0[edx], xmm5 \
  2282. __asm movdqu 16[edx], xmm0 \
  2283. __asm lea edx, [edx + 32]}
  2284. // Store 8 RGBA values.
  2285. #define STORERGBA \
  2286. __asm { \
  2287. __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
  2288. __asm punpcklbw xmm1, xmm2 /* GR */ \
  2289. __asm punpcklbw xmm5, xmm0 /* AB */ \
  2290. __asm movdqa xmm0, xmm5 \
  2291. __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
  2292. __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
  2293. __asm movdqu 0[edx], xmm5 \
  2294. __asm movdqu 16[edx], xmm0 \
  2295. __asm lea edx, [edx + 32]}
  2296. // Store 8 RGB24 values.
  2297. #define STORERGB24 \
  2298. __asm {/* Weave into RRGB */ \
  2299. __asm punpcklbw xmm0, xmm1 /* BG */ \
  2300. __asm punpcklbw xmm2, xmm2 /* RR */ \
  2301. __asm movdqa xmm1, xmm0 \
  2302. __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
  2303. __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
  2304. __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
  2305. __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
  2306. __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
  2307. __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
  2308. __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
  2309. __asm lea edx, [edx + 24]}
  2310. // Store 8 RGB565 values.
  2311. #define STORERGB565 \
  2312. __asm {/* Weave into RRGB */ \
  2313. __asm punpcklbw xmm0, xmm1 /* BG */ \
  2314. __asm punpcklbw xmm2, xmm2 /* RR */ \
  2315. __asm movdqa xmm1, xmm0 \
  2316. __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
  2317. __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
  2318. __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
  2319. __asm movdqa xmm2, xmm0 /* G */ \
  2320. __asm pslld xmm0, 8 /* R */ \
  2321. __asm psrld xmm3, 3 /* B */ \
  2322. __asm psrld xmm2, 5 /* G */ \
  2323. __asm psrad xmm0, 16 /* R */ \
  2324. __asm pand xmm3, xmm5 /* B */ \
  2325. __asm pand xmm2, xmm6 /* G */ \
  2326. __asm pand xmm0, xmm7 /* R */ \
  2327. __asm por xmm3, xmm2 /* BG */ \
  2328. __asm por xmm0, xmm3 /* BGR */ \
  2329. __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
  2330. __asm movdqa xmm2, xmm1 /* G */ \
  2331. __asm pslld xmm1, 8 /* R */ \
  2332. __asm psrld xmm3, 3 /* B */ \
  2333. __asm psrld xmm2, 5 /* G */ \
  2334. __asm psrad xmm1, 16 /* R */ \
  2335. __asm pand xmm3, xmm5 /* B */ \
  2336. __asm pand xmm2, xmm6 /* G */ \
  2337. __asm pand xmm1, xmm7 /* R */ \
  2338. __asm por xmm3, xmm2 /* BG */ \
  2339. __asm por xmm1, xmm3 /* BGR */ \
  2340. __asm packssdw xmm0, xmm1 \
  2341. __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
  2342. __asm lea edx, [edx + 16]}
  2343. // 8 pixels.
  2344. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
  2345. __declspec(naked) void I444ToARGBRow_SSSE3(
  2346. const uint8_t* y_buf,
  2347. const uint8_t* u_buf,
  2348. const uint8_t* v_buf,
  2349. uint8_t* dst_argb,
  2350. const struct YuvConstants* yuvconstants,
  2351. int width) {
  2352. __asm {
  2353. push esi
  2354. push edi
  2355. push ebx
  2356. mov eax, [esp + 12 + 4] // Y
  2357. mov esi, [esp + 12 + 8] // U
  2358. mov edi, [esp + 12 + 12] // V
  2359. mov edx, [esp + 12 + 16] // argb
  2360. mov ebx, [esp + 12 + 20] // yuvconstants
  2361. mov ecx, [esp + 12 + 24] // width
  2362. sub edi, esi
  2363. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2364. convertloop:
  2365. READYUV444
  2366. YUVTORGB(ebx)
  2367. STOREARGB
  2368. sub ecx, 8
  2369. jg convertloop
  2370. pop ebx
  2371. pop edi
  2372. pop esi
  2373. ret
  2374. }
  2375. }
  2376. // 8 pixels.
  2377. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
  2378. __declspec(naked) void I422ToRGB24Row_SSSE3(
  2379. const uint8_t* y_buf,
  2380. const uint8_t* u_buf,
  2381. const uint8_t* v_buf,
  2382. uint8_t* dst_rgb24,
  2383. const struct YuvConstants* yuvconstants,
  2384. int width) {
  2385. __asm {
  2386. push esi
  2387. push edi
  2388. push ebx
  2389. mov eax, [esp + 12 + 4] // Y
  2390. mov esi, [esp + 12 + 8] // U
  2391. mov edi, [esp + 12 + 12] // V
2392. mov edx, [esp + 12 + 16] // rgb24
  2393. mov ebx, [esp + 12 + 20] // yuvconstants
  2394. mov ecx, [esp + 12 + 24] // width
  2395. sub edi, esi
  2396. movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
  2397. movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
  2398. convertloop:
  2399. READYUV422
  2400. YUVTORGB(ebx)
  2401. STORERGB24
  2402. sub ecx, 8
  2403. jg convertloop
  2404. pop ebx
  2405. pop edi
  2406. pop esi
  2407. ret
  2408. }
  2409. }
  2410. // 8 pixels
  2411. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
  2412. __declspec(naked) void I422ToRGB565Row_SSSE3(
  2413. const uint8_t* y_buf,
  2414. const uint8_t* u_buf,
  2415. const uint8_t* v_buf,
  2416. uint8_t* rgb565_buf,
  2417. const struct YuvConstants* yuvconstants,
  2418. int width) {
  2419. __asm {
  2420. push esi
  2421. push edi
  2422. push ebx
  2423. mov eax, [esp + 12 + 4] // Y
  2424. mov esi, [esp + 12 + 8] // U
  2425. mov edi, [esp + 12 + 12] // V
2426. mov edx, [esp + 12 + 16] // rgb565
  2427. mov ebx, [esp + 12 + 20] // yuvconstants
  2428. mov ecx, [esp + 12 + 24] // width
  2429. sub edi, esi
  2430. pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
  2431. psrld xmm5, 27
  2432. pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
  2433. psrld xmm6, 26
  2434. pslld xmm6, 5
  2435. pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
  2436. pslld xmm7, 11
  2437. convertloop:
  2438. READYUV422
  2439. YUVTORGB(ebx)
  2440. STORERGB565
  2441. sub ecx, 8
  2442. jg convertloop
  2443. pop ebx
  2444. pop edi
  2445. pop esi
  2446. ret
  2447. }
  2448. }
  2449. // 8 pixels.
  2450. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2451. __declspec(naked) void I422ToARGBRow_SSSE3(
  2452. const uint8_t* y_buf,
  2453. const uint8_t* u_buf,
  2454. const uint8_t* v_buf,
  2455. uint8_t* dst_argb,
  2456. const struct YuvConstants* yuvconstants,
  2457. int width) {
  2458. __asm {
  2459. push esi
  2460. push edi
  2461. push ebx
  2462. mov eax, [esp + 12 + 4] // Y
  2463. mov esi, [esp + 12 + 8] // U
  2464. mov edi, [esp + 12 + 12] // V
  2465. mov edx, [esp + 12 + 16] // argb
  2466. mov ebx, [esp + 12 + 20] // yuvconstants
  2467. mov ecx, [esp + 12 + 24] // width
  2468. sub edi, esi
  2469. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2470. convertloop:
  2471. READYUV422
  2472. YUVTORGB(ebx)
  2473. STOREARGB
  2474. sub ecx, 8
  2475. jg convertloop
  2476. pop ebx
  2477. pop edi
  2478. pop esi
  2479. ret
  2480. }
  2481. }
  2482. // 8 pixels.
  2483. // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
  2484. __declspec(naked) void I422AlphaToARGBRow_SSSE3(
  2485. const uint8_t* y_buf,
  2486. const uint8_t* u_buf,
  2487. const uint8_t* v_buf,
  2488. const uint8_t* a_buf,
  2489. uint8_t* dst_argb,
  2490. const struct YuvConstants* yuvconstants,
  2491. int width) {
  2492. __asm {
  2493. push esi
  2494. push edi
  2495. push ebx
  2496. push ebp
  2497. mov eax, [esp + 16 + 4] // Y
  2498. mov esi, [esp + 16 + 8] // U
  2499. mov edi, [esp + 16 + 12] // V
  2500. mov ebp, [esp + 16 + 16] // A
  2501. mov edx, [esp + 16 + 20] // argb
  2502. mov ebx, [esp + 16 + 24] // yuvconstants
  2503. mov ecx, [esp + 16 + 28] // width
  2504. sub edi, esi
  2505. convertloop:
  2506. READYUVA422
  2507. YUVTORGB(ebx)
  2508. STOREARGB
  2509. sub ecx, 8
  2510. jg convertloop
  2511. pop ebp
  2512. pop ebx
  2513. pop edi
  2514. pop esi
  2515. ret
  2516. }
  2517. }
  2518. // 8 pixels.
  2519. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2520. __declspec(naked) void NV12ToARGBRow_SSSE3(
  2521. const uint8_t* y_buf,
  2522. const uint8_t* uv_buf,
  2523. uint8_t* dst_argb,
  2524. const struct YuvConstants* yuvconstants,
  2525. int width) {
  2526. __asm {
  2527. push esi
  2528. push ebx
  2529. mov eax, [esp + 8 + 4] // Y
  2530. mov esi, [esp + 8 + 8] // UV
  2531. mov edx, [esp + 8 + 12] // argb
  2532. mov ebx, [esp + 8 + 16] // yuvconstants
  2533. mov ecx, [esp + 8 + 20] // width
  2534. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2535. convertloop:
  2536. READNV12
  2537. YUVTORGB(ebx)
  2538. STOREARGB
  2539. sub ecx, 8
  2540. jg convertloop
  2541. pop ebx
  2542. pop esi
  2543. ret
  2544. }
  2545. }
  2546. // 8 pixels.
2547. // 4 VU values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
  2548. __declspec(naked) void NV21ToARGBRow_SSSE3(
  2549. const uint8_t* y_buf,
  2550. const uint8_t* vu_buf,
  2551. uint8_t* dst_argb,
  2552. const struct YuvConstants* yuvconstants,
  2553. int width) {
  2554. __asm {
  2555. push esi
  2556. push ebx
  2557. mov eax, [esp + 8 + 4] // Y
  2558. mov esi, [esp + 8 + 8] // VU
  2559. mov edx, [esp + 8 + 12] // argb
  2560. mov ebx, [esp + 8 + 16] // yuvconstants
  2561. mov ecx, [esp + 8 + 20] // width
  2562. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2563. convertloop:
  2564. READNV21
  2565. YUVTORGB(ebx)
  2566. STOREARGB
  2567. sub ecx, 8
  2568. jg convertloop
  2569. pop ebx
  2570. pop esi
  2571. ret
  2572. }
  2573. }
  2574. // 8 pixels.
  2575. // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
  2576. __declspec(naked) void YUY2ToARGBRow_SSSE3(
  2577. const uint8_t* src_yuy2,
  2578. uint8_t* dst_argb,
  2579. const struct YuvConstants* yuvconstants,
  2580. int width) {
  2581. __asm {
  2582. push ebx
  2583. mov eax, [esp + 4 + 4] // yuy2
  2584. mov edx, [esp + 4 + 8] // argb
  2585. mov ebx, [esp + 4 + 12] // yuvconstants
  2586. mov ecx, [esp + 4 + 16] // width
  2587. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2588. convertloop:
  2589. READYUY2
  2590. YUVTORGB(ebx)
  2591. STOREARGB
  2592. sub ecx, 8
  2593. jg convertloop
  2594. pop ebx
  2595. ret
  2596. }
  2597. }
  2598. // 8 pixels.
  2599. // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
  2600. __declspec(naked) void UYVYToARGBRow_SSSE3(
  2601. const uint8_t* src_uyvy,
  2602. uint8_t* dst_argb,
  2603. const struct YuvConstants* yuvconstants,
  2604. int width) {
  2605. __asm {
  2606. push ebx
  2607. mov eax, [esp + 4 + 4] // uyvy
  2608. mov edx, [esp + 4 + 8] // argb
  2609. mov ebx, [esp + 4 + 12] // yuvconstants
  2610. mov ecx, [esp + 4 + 16] // width
  2611. pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
  2612. convertloop:
  2613. READUYVY
  2614. YUVTORGB(ebx)
  2615. STOREARGB
  2616. sub ecx, 8
  2617. jg convertloop
  2618. pop ebx
  2619. ret
  2620. }
  2621. }
  2622. __declspec(naked) void I422ToRGBARow_SSSE3(
  2623. const uint8_t* y_buf,
  2624. const uint8_t* u_buf,
  2625. const uint8_t* v_buf,
  2626. uint8_t* dst_rgba,
  2627. const struct YuvConstants* yuvconstants,
  2628. int width) {
  2629. __asm {
  2630. push esi
  2631. push edi
  2632. push ebx
  2633. mov eax, [esp + 12 + 4] // Y
  2634. mov esi, [esp + 12 + 8] // U
  2635. mov edi, [esp + 12 + 12] // V
  2636. mov edx, [esp + 12 + 16] // argb
  2637. mov ebx, [esp + 12 + 20] // yuvconstants
  2638. mov ecx, [esp + 12 + 24] // width
  2639. sub edi, esi
  2640. convertloop:
  2641. READYUV422
  2642. YUVTORGB(ebx)
  2643. STORERGBA
  2644. sub ecx, 8
  2645. jg convertloop
  2646. pop ebx
  2647. pop edi
  2648. pop esi
  2649. ret
  2650. }
  2651. }
  2652. #endif // HAS_I422TOARGBROW_SSSE3
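// The YUV-to-ARGB kernels in this block all run through the READxxx,
// YUVTORGB and STORExxx macros. As an illustrative scalar sketch only --
// a hypothetical helper, not libyuv's own C fallback, and using textbook
// BT.601 studio-range coefficients instead of the table-driven yuvconstants
// the macros actually load -- one pixel of the conversion looks like this:
static void YuvToArgbPixel_Sketch_C(uint8_t y, uint8_t u, uint8_t v,
                                    uint8_t* dst_argb) {
  int c = ((int)y - 16) * 298;  // 1.164 in 8.8 fixed point.
  int d = (int)u - 128;
  int e = (int)v - 128;
  int b = (c + 516 * d + 128) >> 8;            // B = 1.164*Y + 2.018*U
  int g = (c - 100 * d - 208 * e + 128) >> 8;  // G = 1.164*Y - 0.391*U - 0.813*V
  int r = (c + 409 * e + 128) >> 8;            // R = 1.164*Y + 1.596*V
  dst_argb[0] = (uint8_t)(b < 0 ? 0 : b > 255 ? 255 : b);
  dst_argb[1] = (uint8_t)(g < 0 ? 0 : g > 255 ? 255 : g);
  dst_argb[2] = (uint8_t)(r < 0 ? 0 : r > 255 ? 255 : r);
  dst_argb[3] = 255;  // STOREARGB writes 0xff alpha (pcmpeqb xmm5 above).
}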
  2653. // I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter
  2654. #ifdef HAS_I400TOARGBROW_SSE2
  2655. // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
  2656. __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
  2657. uint8_t* rgb_buf,
  2658. const struct YuvConstants*,
  2659. int width) {
  2660. __asm {
  2661. mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
  2662. movd xmm2, eax
  2663. pshufd xmm2, xmm2,0
  2664. mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
  2665. movd xmm3, eax
  2666. pshufd xmm3, xmm3, 0
  2667. pcmpeqb xmm4, xmm4 // generate mask 0xff000000
  2668. pslld xmm4, 24
  2669. mov eax, [esp + 4] // Y
  2670. mov edx, [esp + 8] // rgb
  2671. mov ecx, [esp + 12] // width
  2672. convertloop:
  2673. // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  2674. movq xmm0, qword ptr [eax]
  2675. lea eax, [eax + 8]
  2676. punpcklbw xmm0, xmm0 // Y.Y
  2677. pmulhuw xmm0, xmm2
  2678. psubusw xmm0, xmm3
  2679. psrlw xmm0, 6
  2680. packuswb xmm0, xmm0 // G
  2681. // Step 2: Weave into ARGB
  2682. punpcklbw xmm0, xmm0 // GG
  2683. movdqa xmm1, xmm0
  2684. punpcklwd xmm0, xmm0 // BGRA first 4 pixels
  2685. punpckhwd xmm1, xmm1 // BGRA next 4 pixels
  2686. por xmm0, xmm4
  2687. por xmm1, xmm4
  2688. movdqu [edx], xmm0
  2689. movdqu [edx + 16], xmm1
  2690. lea edx, [edx + 32]
  2691. sub ecx, 8
  2692. jg convertloop
  2693. ret
  2694. }
  2695. }
  2696. #endif // HAS_I400TOARGBROW_SSE2
  2697. #ifdef HAS_I400TOARGBROW_AVX2
  2698. // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
  2699. // note: vpunpcklbw mutates and vpackuswb unmutates.
  2700. __declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
  2701. uint8_t* rgb_buf,
  2702. const struct YuvConstants*,
  2703. int width) {
  2704. __asm {
  2705. mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
  2706. vmovd xmm2, eax
  2707. vbroadcastss ymm2, xmm2
  2708. mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
  2709. vmovd xmm3, eax
  2710. vbroadcastss ymm3, xmm3
  2711. vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
  2712. vpslld ymm4, ymm4, 24
  2713. mov eax, [esp + 4] // Y
  2714. mov edx, [esp + 8] // rgb
  2715. mov ecx, [esp + 12] // width
  2716. convertloop:
2717. // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
  2718. vmovdqu xmm0, [eax]
  2719. lea eax, [eax + 16]
  2720. vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
  2721. vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
  2722. vpmulhuw ymm0, ymm0, ymm2
  2723. vpsubusw ymm0, ymm0, ymm3
  2724. vpsrlw ymm0, ymm0, 6
  2725. vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
  2726. // TODO(fbarchard): Weave alpha with unpack.
  2727. // Step 2: Weave into ARGB
  2728. vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
  2729. vpermq ymm1, ymm1, 0xd8
  2730. vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
  2731. vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
  2732. vpor ymm0, ymm0, ymm4
  2733. vpor ymm1, ymm1, ymm4
  2734. vmovdqu [edx], ymm0
  2735. vmovdqu [edx + 32], ymm1
  2736. lea edx, [edx + 64]
  2737. sub ecx, 16
  2738. jg convertloop
  2739. vzeroupper
  2740. ret
  2741. }
  2742. }
  2743. #endif // HAS_I400TOARGBROW_AVX2
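// Both I400ToARGB kernels above expand luma-only input by replicating the
// rescaled Y into B, G and R and forcing alpha to 255, using the 0x4a35 and
// 0x0488 constants loaded into the vector registers. An illustrative scalar
// sketch of the same fixed-point math (hypothetical helper, not the
// project's C fallback):
static void I400ToARGBRow_Sketch_C(const uint8_t* src_y, uint8_t* dst_argb,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    int y = src_y[x];
    int g = ((y * 257 * 18997) >> 16) - 1160;  // punpcklbw doubles the byte,
    if (g < 0) g = 0;                          // psubusw saturates at 0,
    g >>= 6;                                   // psrlw 6,
    if (g > 255) g = 255;                      // packuswb saturates at 255.
    dst_argb[4 * x + 0] = (uint8_t)g;  // B
    dst_argb[4 * x + 1] = (uint8_t)g;  // G
    dst_argb[4 * x + 2] = (uint8_t)g;  // R
    dst_argb[4 * x + 3] = 255;         // A
  }
}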
  2744. #ifdef HAS_MIRRORROW_SSSE3
  2745. // Shuffle table for reversing the bytes.
  2746. static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
  2747. 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
  2748. // TODO(fbarchard): Replace lea with -16 offset.
  2749. __declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
  2750. uint8_t* dst,
  2751. int width) {
  2752. __asm {
  2753. mov eax, [esp + 4] // src
  2754. mov edx, [esp + 8] // dst
  2755. mov ecx, [esp + 12] // width
  2756. movdqa xmm5, xmmword ptr kShuffleMirror
  2757. convertloop:
  2758. movdqu xmm0, [eax - 16 + ecx]
  2759. pshufb xmm0, xmm5
  2760. movdqu [edx], xmm0
  2761. lea edx, [edx + 16]
  2762. sub ecx, 16
  2763. jg convertloop
  2764. ret
  2765. }
  2766. }
  2767. #endif // HAS_MIRRORROW_SSSE3
  2768. #ifdef HAS_MIRRORROW_AVX2
  2769. __declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
  2770. uint8_t* dst,
  2771. int width) {
  2772. __asm {
  2773. mov eax, [esp + 4] // src
  2774. mov edx, [esp + 8] // dst
  2775. mov ecx, [esp + 12] // width
  2776. vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
  2777. convertloop:
  2778. vmovdqu ymm0, [eax - 32 + ecx]
  2779. vpshufb ymm0, ymm0, ymm5
2780. vpermq ymm0, ymm0, 0x4e // swap high and low halves
  2781. vmovdqu [edx], ymm0
  2782. lea edx, [edx + 32]
  2783. sub ecx, 32
  2784. jg convertloop
  2785. vzeroupper
  2786. ret
  2787. }
  2788. }
  2789. #endif // HAS_MIRRORROW_AVX2
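// MirrorRow_SSSE3/AVX2 above walk the source from the tail ([eax - 16 + ecx])
// and reverse bytes in-register with pshufb (plus vpermq to swap the AVX2
// lanes). The net effect is a plain byte reversal; an illustrative scalar
// sketch (hypothetical helper):
static void MirrorRow_Sketch_C(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}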
  2790. #ifdef HAS_MIRRORSPLITUVROW_SSSE3
  2791. // Shuffle table for reversing the bytes of UV channels.
  2792. static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
  2793. 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
  2794. __declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
  2795. uint8_t* dst_u,
  2796. uint8_t* dst_v,
  2797. int width) {
  2798. __asm {
  2799. push edi
  2800. mov eax, [esp + 4 + 4] // src
  2801. mov edx, [esp + 4 + 8] // dst_u
  2802. mov edi, [esp + 4 + 12] // dst_v
  2803. mov ecx, [esp + 4 + 16] // width
  2804. movdqa xmm1, xmmword ptr kShuffleMirrorUV
  2805. lea eax, [eax + ecx * 2 - 16]
  2806. sub edi, edx
  2807. convertloop:
  2808. movdqu xmm0, [eax]
  2809. lea eax, [eax - 16]
  2810. pshufb xmm0, xmm1
  2811. movlpd qword ptr [edx], xmm0
  2812. movhpd qword ptr [edx + edi], xmm0
  2813. lea edx, [edx + 8]
  2814. sub ecx, 8
  2815. jg convertloop
  2816. pop edi
  2817. ret
  2818. }
  2819. }
  2820. #endif // HAS_MIRRORSPLITUVROW_SSSE3
  2821. #ifdef HAS_ARGBMIRRORROW_SSE2
  2822. __declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
  2823. uint8_t* dst,
  2824. int width) {
  2825. __asm {
  2826. mov eax, [esp + 4] // src
  2827. mov edx, [esp + 8] // dst
  2828. mov ecx, [esp + 12] // width
  2829. lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
  2830. convertloop:
  2831. movdqu xmm0, [eax]
  2832. lea eax, [eax - 16]
  2833. pshufd xmm0, xmm0, 0x1b
  2834. movdqu [edx], xmm0
  2835. lea edx, [edx + 16]
  2836. sub ecx, 4
  2837. jg convertloop
  2838. ret
  2839. }
  2840. }
  2841. #endif // HAS_ARGBMIRRORROW_SSE2
  2842. #ifdef HAS_ARGBMIRRORROW_AVX2
  2843. // Shuffle table for reversing the bytes.
  2844. static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
  2845. __declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
  2846. uint8_t* dst,
  2847. int width) {
  2848. __asm {
  2849. mov eax, [esp + 4] // src
  2850. mov edx, [esp + 8] // dst
  2851. mov ecx, [esp + 12] // width
  2852. vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
  2853. convertloop:
  2854. vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
  2855. vmovdqu [edx], ymm0
  2856. lea edx, [edx + 32]
  2857. sub ecx, 8
  2858. jg convertloop
  2859. vzeroupper
  2860. ret
  2861. }
  2862. }
  2863. #endif // HAS_ARGBMIRRORROW_AVX2
  2864. #ifdef HAS_SPLITUVROW_SSE2
  2865. __declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
  2866. uint8_t* dst_u,
  2867. uint8_t* dst_v,
  2868. int width) {
  2869. __asm {
  2870. push edi
  2871. mov eax, [esp + 4 + 4] // src_uv
  2872. mov edx, [esp + 4 + 8] // dst_u
  2873. mov edi, [esp + 4 + 12] // dst_v
  2874. mov ecx, [esp + 4 + 16] // width
  2875. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  2876. psrlw xmm5, 8
  2877. sub edi, edx
  2878. convertloop:
  2879. movdqu xmm0, [eax]
  2880. movdqu xmm1, [eax + 16]
  2881. lea eax, [eax + 32]
  2882. movdqa xmm2, xmm0
  2883. movdqa xmm3, xmm1
  2884. pand xmm0, xmm5 // even bytes
  2885. pand xmm1, xmm5
  2886. packuswb xmm0, xmm1
  2887. psrlw xmm2, 8 // odd bytes
  2888. psrlw xmm3, 8
  2889. packuswb xmm2, xmm3
  2890. movdqu [edx], xmm0
  2891. movdqu [edx + edi], xmm2
  2892. lea edx, [edx + 16]
  2893. sub ecx, 16
  2894. jg convertloop
  2895. pop edi
  2896. ret
  2897. }
  2898. }
  2899. #endif // HAS_SPLITUVROW_SSE2
  2900. #ifdef HAS_SPLITUVROW_AVX2
  2901. __declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
  2902. uint8_t* dst_u,
  2903. uint8_t* dst_v,
  2904. int width) {
  2905. __asm {
  2906. push edi
  2907. mov eax, [esp + 4 + 4] // src_uv
  2908. mov edx, [esp + 4 + 8] // dst_u
  2909. mov edi, [esp + 4 + 12] // dst_v
  2910. mov ecx, [esp + 4 + 16] // width
  2911. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  2912. vpsrlw ymm5, ymm5, 8
  2913. sub edi, edx
  2914. convertloop:
  2915. vmovdqu ymm0, [eax]
  2916. vmovdqu ymm1, [eax + 32]
  2917. lea eax, [eax + 64]
  2918. vpsrlw ymm2, ymm0, 8 // odd bytes
  2919. vpsrlw ymm3, ymm1, 8
  2920. vpand ymm0, ymm0, ymm5 // even bytes
  2921. vpand ymm1, ymm1, ymm5
  2922. vpackuswb ymm0, ymm0, ymm1
  2923. vpackuswb ymm2, ymm2, ymm3
  2924. vpermq ymm0, ymm0, 0xd8
  2925. vpermq ymm2, ymm2, 0xd8
  2926. vmovdqu [edx], ymm0
  2927. vmovdqu [edx + edi], ymm2
  2928. lea edx, [edx + 32]
  2929. sub ecx, 32
  2930. jg convertloop
  2931. pop edi
  2932. vzeroupper
  2933. ret
  2934. }
  2935. }
  2936. #endif // HAS_SPLITUVROW_AVX2
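// SplitUVRow_SSE2/AVX2 above deinterleave a packed UV row: the 0x00ff00ff
// mask keeps the even (U) bytes and the right shift by 8 keeps the odd (V)
// bytes before packing. An illustrative scalar sketch (hypothetical helper):
static void SplitUVRow_Sketch_C(const uint8_t* src_uv, uint8_t* dst_u,
                                uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x];      // even bytes.
    dst_v[x] = src_uv[2 * x + 1];  // odd bytes.
  }
}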
  2937. #ifdef HAS_MERGEUVROW_SSE2
  2938. __declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
  2939. const uint8_t* src_v,
  2940. uint8_t* dst_uv,
  2941. int width) {
  2942. __asm {
  2943. push edi
  2944. mov eax, [esp + 4 + 4] // src_u
  2945. mov edx, [esp + 4 + 8] // src_v
  2946. mov edi, [esp + 4 + 12] // dst_uv
  2947. mov ecx, [esp + 4 + 16] // width
  2948. sub edx, eax
  2949. convertloop:
  2950. movdqu xmm0, [eax] // read 16 U's
  2951. movdqu xmm1, [eax + edx] // and 16 V's
  2952. lea eax, [eax + 16]
  2953. movdqa xmm2, xmm0
  2954. punpcklbw xmm0, xmm1 // first 8 UV pairs
  2955. punpckhbw xmm2, xmm1 // next 8 UV pairs
  2956. movdqu [edi], xmm0
  2957. movdqu [edi + 16], xmm2
  2958. lea edi, [edi + 32]
  2959. sub ecx, 16
  2960. jg convertloop
  2961. pop edi
  2962. ret
  2963. }
  2964. }
  2965. #endif // HAS_MERGEUVROW_SSE2
  2966. #ifdef HAS_MERGEUVROW_AVX2
  2967. __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
  2968. const uint8_t* src_v,
  2969. uint8_t* dst_uv,
  2970. int width) {
  2971. __asm {
  2972. push edi
  2973. mov eax, [esp + 4 + 4] // src_u
  2974. mov edx, [esp + 4 + 8] // src_v
  2975. mov edi, [esp + 4 + 12] // dst_uv
  2976. mov ecx, [esp + 4 + 16] // width
  2977. sub edx, eax
  2978. convertloop:
  2979. vmovdqu ymm0, [eax] // read 32 U's
  2980. vmovdqu ymm1, [eax + edx] // and 32 V's
  2981. lea eax, [eax + 32]
  2982. vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
  2983. vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
  2984. vextractf128 [edi], ymm2, 0 // bytes 0..15
  2985. vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
  2986. vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
2987. vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
  2988. lea edi, [edi + 64]
  2989. sub ecx, 32
  2990. jg convertloop
  2991. pop edi
  2992. vzeroupper
  2993. ret
  2994. }
  2995. }
  2996. #endif // HAS_MERGEUVROW_AVX2
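// MergeUVRow_SSE2/AVX2 above are the inverse of SplitUV: punpcklbw/punpckhbw
// interleave a register of U bytes with a register of V bytes into UVUV...
// pairs. An illustrative scalar sketch (hypothetical helper):
static void MergeUVRow_Sketch_C(const uint8_t* src_u, const uint8_t* src_v,
                                uint8_t* dst_uv, int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}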
  2997. #ifdef HAS_COPYROW_SSE2
2998. // CopyRow copies 'width' bytes using 16 byte loads/stores, 32 bytes at a time.
  2999. __declspec(naked) void CopyRow_SSE2(const uint8_t* src,
  3000. uint8_t* dst,
  3001. int width) {
  3002. __asm {
  3003. mov eax, [esp + 4] // src
  3004. mov edx, [esp + 8] // dst
  3005. mov ecx, [esp + 12] // width
  3006. test eax, 15
  3007. jne convertloopu
  3008. test edx, 15
  3009. jne convertloopu
  3010. convertloopa:
  3011. movdqa xmm0, [eax]
  3012. movdqa xmm1, [eax + 16]
  3013. lea eax, [eax + 32]
  3014. movdqa [edx], xmm0
  3015. movdqa [edx + 16], xmm1
  3016. lea edx, [edx + 32]
  3017. sub ecx, 32
  3018. jg convertloopa
  3019. ret
  3020. convertloopu:
  3021. movdqu xmm0, [eax]
  3022. movdqu xmm1, [eax + 16]
  3023. lea eax, [eax + 32]
  3024. movdqu [edx], xmm0
  3025. movdqu [edx + 16], xmm1
  3026. lea edx, [edx + 32]
  3027. sub ecx, 32
  3028. jg convertloopu
  3029. ret
  3030. }
  3031. }
  3032. #endif // HAS_COPYROW_SSE2
  3033. #ifdef HAS_COPYROW_AVX
3034. // CopyRow copies 'width' bytes using 32 byte loads/stores, 64 bytes at a time.
  3035. __declspec(naked) void CopyRow_AVX(const uint8_t* src,
  3036. uint8_t* dst,
  3037. int width) {
  3038. __asm {
  3039. mov eax, [esp + 4] // src
  3040. mov edx, [esp + 8] // dst
  3041. mov ecx, [esp + 12] // width
  3042. convertloop:
  3043. vmovdqu ymm0, [eax]
  3044. vmovdqu ymm1, [eax + 32]
  3045. lea eax, [eax + 64]
  3046. vmovdqu [edx], ymm0
  3047. vmovdqu [edx + 32], ymm1
  3048. lea edx, [edx + 64]
  3049. sub ecx, 64
  3050. jg convertloop
  3051. vzeroupper
  3052. ret
  3053. }
  3054. }
  3055. #endif // HAS_COPYROW_AVX
3056. // CopyRow using 'rep movsb'. Width can be any size (multiple of 1 byte).
  3057. __declspec(naked) void CopyRow_ERMS(const uint8_t* src,
  3058. uint8_t* dst,
  3059. int width) {
  3060. __asm {
  3061. mov eax, esi
  3062. mov edx, edi
  3063. mov esi, [esp + 4] // src
  3064. mov edi, [esp + 8] // dst
  3065. mov ecx, [esp + 12] // width
  3066. rep movsb
  3067. mov edi, edx
  3068. mov esi, eax
  3069. ret
  3070. }
  3071. }
  3072. #ifdef HAS_ARGBCOPYALPHAROW_SSE2
  3073. // width in pixels
  3074. __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
  3075. uint8_t* dst,
  3076. int width) {
  3077. __asm {
  3078. mov eax, [esp + 4] // src
  3079. mov edx, [esp + 8] // dst
  3080. mov ecx, [esp + 12] // width
  3081. pcmpeqb xmm0, xmm0 // generate mask 0xff000000
  3082. pslld xmm0, 24
  3083. pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
  3084. psrld xmm1, 8
  3085. convertloop:
  3086. movdqu xmm2, [eax]
  3087. movdqu xmm3, [eax + 16]
  3088. lea eax, [eax + 32]
  3089. movdqu xmm4, [edx]
  3090. movdqu xmm5, [edx + 16]
  3091. pand xmm2, xmm0
  3092. pand xmm3, xmm0
  3093. pand xmm4, xmm1
  3094. pand xmm5, xmm1
  3095. por xmm2, xmm4
  3096. por xmm3, xmm5
  3097. movdqu [edx], xmm2
  3098. movdqu [edx + 16], xmm3
  3099. lea edx, [edx + 32]
  3100. sub ecx, 8
  3101. jg convertloop
  3102. ret
  3103. }
  3104. }
  3105. #endif // HAS_ARGBCOPYALPHAROW_SSE2
  3106. #ifdef HAS_ARGBCOPYALPHAROW_AVX2
  3107. // width in pixels
  3108. __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
  3109. uint8_t* dst,
  3110. int width) {
  3111. __asm {
  3112. mov eax, [esp + 4] // src
  3113. mov edx, [esp + 8] // dst
  3114. mov ecx, [esp + 12] // width
  3115. vpcmpeqb ymm0, ymm0, ymm0
  3116. vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
  3117. convertloop:
  3118. vmovdqu ymm1, [eax]
  3119. vmovdqu ymm2, [eax + 32]
  3120. lea eax, [eax + 64]
  3121. vpblendvb ymm1, ymm1, [edx], ymm0
  3122. vpblendvb ymm2, ymm2, [edx + 32], ymm0
  3123. vmovdqu [edx], ymm1
  3124. vmovdqu [edx + 32], ymm2
  3125. lea edx, [edx + 64]
  3126. sub ecx, 16
  3127. jg convertloop
  3128. vzeroupper
  3129. ret
  3130. }
  3131. }
  3132. #endif // HAS_ARGBCOPYALPHAROW_AVX2
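// ARGBCopyAlphaRow_SSE2/AVX2 above take only the alpha byte from src and keep
// dst's B, G and R (masks 0xff000000 / 0x00ffffff, or vpblendvb). An
// illustrative scalar sketch (hypothetical helper):
static void ARGBCopyAlphaRow_Sketch_C(const uint8_t* src, uint8_t* dst,
                                      int width) {
  for (int x = 0; x < width; ++x) {
    dst[4 * x + 3] = src[4 * x + 3];  // copy A; leave B, G, R in dst alone.
  }
}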
  3133. #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
  3134. // width in pixels
  3135. __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
  3136. uint8_t* dst_a,
  3137. int width) {
  3138. __asm {
  3139. mov eax, [esp + 4] // src_argb
  3140. mov edx, [esp + 8] // dst_a
  3141. mov ecx, [esp + 12] // width
  3142. extractloop:
  3143. movdqu xmm0, [eax]
  3144. movdqu xmm1, [eax + 16]
  3145. lea eax, [eax + 32]
  3146. psrld xmm0, 24
  3147. psrld xmm1, 24
  3148. packssdw xmm0, xmm1
  3149. packuswb xmm0, xmm0
  3150. movq qword ptr [edx], xmm0
  3151. lea edx, [edx + 8]
  3152. sub ecx, 8
  3153. jg extractloop
  3154. ret
  3155. }
  3156. }
  3157. #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
  3158. #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
  3159. // width in pixels
  3160. __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
  3161. uint8_t* dst_a,
  3162. int width) {
  3163. __asm {
  3164. mov eax, [esp + 4] // src_argb
  3165. mov edx, [esp + 8] // dst_a
  3166. mov ecx, [esp + 12] // width
  3167. vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX
  3168. extractloop:
  3169. vmovdqu ymm0, [eax]
  3170. vmovdqu ymm1, [eax + 32]
  3171. vpsrld ymm0, ymm0, 24
  3172. vpsrld ymm1, ymm1, 24
  3173. vmovdqu ymm2, [eax + 64]
  3174. vmovdqu ymm3, [eax + 96]
  3175. lea eax, [eax + 128]
  3176. vpackssdw ymm0, ymm0, ymm1 // mutates
  3177. vpsrld ymm2, ymm2, 24
  3178. vpsrld ymm3, ymm3, 24
  3179. vpackssdw ymm2, ymm2, ymm3 // mutates
  3180. vpackuswb ymm0, ymm0, ymm2 // mutates
  3181. vpermd ymm0, ymm4, ymm0 // unmutate
  3182. vmovdqu [edx], ymm0
  3183. lea edx, [edx + 32]
  3184. sub ecx, 32
  3185. jg extractloop
  3186. vzeroupper
  3187. ret
  3188. }
  3189. }
  3190. #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
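// ARGBExtractAlphaRow_SSE2/AVX2 above shift each pixel right by 24 so only
// the alpha byte survives, then narrow with packssdw/packuswb. An
// illustrative scalar sketch (hypothetical helper):
static void ARGBExtractAlphaRow_Sketch_C(const uint8_t* src_argb,
                                         uint8_t* dst_a, int width) {
  for (int x = 0; x < width; ++x) {
    dst_a[x] = src_argb[4 * x + 3];  // byte 3 of each BGRA pixel is alpha.
  }
}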
  3191. #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
  3192. // width in pixels
  3193. __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
  3194. uint8_t* dst,
  3195. int width) {
  3196. __asm {
  3197. mov eax, [esp + 4] // src
  3198. mov edx, [esp + 8] // dst
  3199. mov ecx, [esp + 12] // width
  3200. pcmpeqb xmm0, xmm0 // generate mask 0xff000000
  3201. pslld xmm0, 24
  3202. pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
  3203. psrld xmm1, 8
  3204. convertloop:
  3205. movq xmm2, qword ptr [eax] // 8 Y's
  3206. lea eax, [eax + 8]
  3207. punpcklbw xmm2, xmm2
  3208. punpckhwd xmm3, xmm2
  3209. punpcklwd xmm2, xmm2
  3210. movdqu xmm4, [edx]
  3211. movdqu xmm5, [edx + 16]
  3212. pand xmm2, xmm0
  3213. pand xmm3, xmm0
  3214. pand xmm4, xmm1
  3215. pand xmm5, xmm1
  3216. por xmm2, xmm4
  3217. por xmm3, xmm5
  3218. movdqu [edx], xmm2
  3219. movdqu [edx + 16], xmm3
  3220. lea edx, [edx + 32]
  3221. sub ecx, 8
  3222. jg convertloop
  3223. ret
  3224. }
  3225. }
  3226. #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
  3227. #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
  3228. // width in pixels
  3229. __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
  3230. uint8_t* dst,
  3231. int width) {
  3232. __asm {
  3233. mov eax, [esp + 4] // src
  3234. mov edx, [esp + 8] // dst
  3235. mov ecx, [esp + 12] // width
  3236. vpcmpeqb ymm0, ymm0, ymm0
  3237. vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
  3238. convertloop:
  3239. vpmovzxbd ymm1, qword ptr [eax]
  3240. vpmovzxbd ymm2, qword ptr [eax + 8]
  3241. lea eax, [eax + 16]
  3242. vpslld ymm1, ymm1, 24
  3243. vpslld ymm2, ymm2, 24
  3244. vpblendvb ymm1, ymm1, [edx], ymm0
  3245. vpblendvb ymm2, ymm2, [edx + 32], ymm0
  3246. vmovdqu [edx], ymm1
  3247. vmovdqu [edx + 32], ymm2
  3248. lea edx, [edx + 64]
  3249. sub ecx, 16
  3250. jg convertloop
  3251. vzeroupper
  3252. ret
  3253. }
  3254. }
  3255. #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
  3256. #ifdef HAS_SETROW_X86
  3257. // Write 'width' bytes using an 8 bit value repeated.
3258. // width should be a multiple of 4.
  3259. __declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
  3260. __asm {
  3261. movzx eax, byte ptr [esp + 8] // v8
  3262. mov edx, 0x01010101 // Duplicate byte to all bytes.
  3263. mul edx // overwrites edx with upper part of result.
  3264. mov edx, edi
  3265. mov edi, [esp + 4] // dst
  3266. mov ecx, [esp + 12] // width
  3267. shr ecx, 2
  3268. rep stosd
  3269. mov edi, edx
  3270. ret
  3271. }
  3272. }
  3273. // Write 'width' bytes using an 8 bit value repeated.
  3274. __declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
  3275. __asm {
  3276. mov edx, edi
  3277. mov edi, [esp + 4] // dst
  3278. mov eax, [esp + 8] // v8
  3279. mov ecx, [esp + 12] // width
  3280. rep stosb
  3281. mov edi, edx
  3282. ret
  3283. }
  3284. }
  3285. // Write 'width' 32 bit values.
  3286. __declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
  3287. uint32_t v32,
  3288. int width) {
  3289. __asm {
  3290. mov edx, edi
  3291. mov edi, [esp + 4] // dst
  3292. mov eax, [esp + 8] // v32
  3293. mov ecx, [esp + 12] // width
  3294. rep stosd
  3295. mov edi, edx
  3296. ret
  3297. }
  3298. }
  3299. #endif // HAS_SETROW_X86
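// SetRow_X86 above broadcasts the byte with 'v8 * 0x01010101' and stores a
// dword at a time via rep stosd. An illustrative scalar sketch of the same
// trick (hypothetical helper; assumes width is a multiple of 4 and that
// memcpy is available):
static void SetRow_Sketch_C(uint8_t* dst, uint8_t v8, int width) {
  uint32_t v32 = (uint32_t)v8 * 0x01010101u;  // replicate byte into 4 lanes.
  for (int x = 0; x < width; x += 4) {
    memcpy(dst + x, &v32, 4);  // one dword store, like rep stosd.
  }
}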
  3300. #ifdef HAS_YUY2TOYROW_AVX2
  3301. __declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
  3302. uint8_t* dst_y,
  3303. int width) {
  3304. __asm {
  3305. mov eax, [esp + 4] // src_yuy2
  3306. mov edx, [esp + 8] // dst_y
  3307. mov ecx, [esp + 12] // width
  3308. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3309. vpsrlw ymm5, ymm5, 8
  3310. convertloop:
  3311. vmovdqu ymm0, [eax]
  3312. vmovdqu ymm1, [eax + 32]
  3313. lea eax, [eax + 64]
  3314. vpand ymm0, ymm0, ymm5 // even bytes are Y
  3315. vpand ymm1, ymm1, ymm5
  3316. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3317. vpermq ymm0, ymm0, 0xd8
  3318. vmovdqu [edx], ymm0
  3319. lea edx, [edx + 32]
  3320. sub ecx, 32
  3321. jg convertloop
  3322. vzeroupper
  3323. ret
  3324. }
  3325. }
  3326. __declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
  3327. int stride_yuy2,
  3328. uint8_t* dst_u,
  3329. uint8_t* dst_v,
  3330. int width) {
  3331. __asm {
  3332. push esi
  3333. push edi
  3334. mov eax, [esp + 8 + 4] // src_yuy2
  3335. mov esi, [esp + 8 + 8] // stride_yuy2
  3336. mov edx, [esp + 8 + 12] // dst_u
  3337. mov edi, [esp + 8 + 16] // dst_v
  3338. mov ecx, [esp + 8 + 20] // width
  3339. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3340. vpsrlw ymm5, ymm5, 8
  3341. sub edi, edx
  3342. convertloop:
  3343. vmovdqu ymm0, [eax]
  3344. vmovdqu ymm1, [eax + 32]
  3345. vpavgb ymm0, ymm0, [eax + esi]
  3346. vpavgb ymm1, ymm1, [eax + esi + 32]
  3347. lea eax, [eax + 64]
  3348. vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
  3349. vpsrlw ymm1, ymm1, 8
  3350. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3351. vpermq ymm0, ymm0, 0xd8
  3352. vpand ymm1, ymm0, ymm5 // U
  3353. vpsrlw ymm0, ymm0, 8 // V
  3354. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3355. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3356. vpermq ymm1, ymm1, 0xd8
  3357. vpermq ymm0, ymm0, 0xd8
  3358. vextractf128 [edx], ymm1, 0 // U
  3359. vextractf128 [edx + edi], ymm0, 0 // V
  3360. lea edx, [edx + 16]
  3361. sub ecx, 32
  3362. jg convertloop
  3363. pop edi
  3364. pop esi
  3365. vzeroupper
  3366. ret
  3367. }
  3368. }
  3369. __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
  3370. uint8_t* dst_u,
  3371. uint8_t* dst_v,
  3372. int width) {
  3373. __asm {
  3374. push edi
  3375. mov eax, [esp + 4 + 4] // src_yuy2
  3376. mov edx, [esp + 4 + 8] // dst_u
  3377. mov edi, [esp + 4 + 12] // dst_v
  3378. mov ecx, [esp + 4 + 16] // width
  3379. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3380. vpsrlw ymm5, ymm5, 8
  3381. sub edi, edx
  3382. convertloop:
  3383. vmovdqu ymm0, [eax]
  3384. vmovdqu ymm1, [eax + 32]
  3385. lea eax, [eax + 64]
  3386. vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
  3387. vpsrlw ymm1, ymm1, 8
  3388. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3389. vpermq ymm0, ymm0, 0xd8
  3390. vpand ymm1, ymm0, ymm5 // U
  3391. vpsrlw ymm0, ymm0, 8 // V
  3392. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3393. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3394. vpermq ymm1, ymm1, 0xd8
  3395. vpermq ymm0, ymm0, 0xd8
  3396. vextractf128 [edx], ymm1, 0 // U
  3397. vextractf128 [edx + edi], ymm0, 0 // V
  3398. lea edx, [edx + 16]
  3399. sub ecx, 32
  3400. jg convertloop
  3401. pop edi
  3402. vzeroupper
  3403. ret
  3404. }
  3405. }
  3406. __declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
  3407. uint8_t* dst_y,
  3408. int width) {
  3409. __asm {
  3410. mov eax, [esp + 4] // src_uyvy
  3411. mov edx, [esp + 8] // dst_y
  3412. mov ecx, [esp + 12] // width
  3413. convertloop:
  3414. vmovdqu ymm0, [eax]
  3415. vmovdqu ymm1, [eax + 32]
  3416. lea eax, [eax + 64]
  3417. vpsrlw ymm0, ymm0, 8 // odd bytes are Y
  3418. vpsrlw ymm1, ymm1, 8
  3419. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3420. vpermq ymm0, ymm0, 0xd8
  3421. vmovdqu [edx], ymm0
  3422. lea edx, [edx + 32]
  3423. sub ecx, 32
  3424. jg convertloop
  3425. vzeroupper
  3426. ret
  3427. }
  3428. }
  3429. __declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
  3430. int stride_uyvy,
  3431. uint8_t* dst_u,
  3432. uint8_t* dst_v,
  3433. int width) {
  3434. __asm {
  3435. push esi
  3436. push edi
3437. mov eax, [esp + 8 + 4] // src_uyvy
3438. mov esi, [esp + 8 + 8] // stride_uyvy
  3439. mov edx, [esp + 8 + 12] // dst_u
  3440. mov edi, [esp + 8 + 16] // dst_v
  3441. mov ecx, [esp + 8 + 20] // width
  3442. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3443. vpsrlw ymm5, ymm5, 8
  3444. sub edi, edx
  3445. convertloop:
  3446. vmovdqu ymm0, [eax]
  3447. vmovdqu ymm1, [eax + 32]
  3448. vpavgb ymm0, ymm0, [eax + esi]
  3449. vpavgb ymm1, ymm1, [eax + esi + 32]
  3450. lea eax, [eax + 64]
  3451. vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
  3452. vpand ymm1, ymm1, ymm5
  3453. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3454. vpermq ymm0, ymm0, 0xd8
  3455. vpand ymm1, ymm0, ymm5 // U
  3456. vpsrlw ymm0, ymm0, 8 // V
  3457. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3458. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3459. vpermq ymm1, ymm1, 0xd8
  3460. vpermq ymm0, ymm0, 0xd8
  3461. vextractf128 [edx], ymm1, 0 // U
  3462. vextractf128 [edx + edi], ymm0, 0 // V
  3463. lea edx, [edx + 16]
  3464. sub ecx, 32
  3465. jg convertloop
  3466. pop edi
  3467. pop esi
  3468. vzeroupper
  3469. ret
  3470. }
  3471. }
  3472. __declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
  3473. uint8_t* dst_u,
  3474. uint8_t* dst_v,
  3475. int width) {
  3476. __asm {
  3477. push edi
3478. mov eax, [esp + 4 + 4] // src_uyvy
  3479. mov edx, [esp + 4 + 8] // dst_u
  3480. mov edi, [esp + 4 + 12] // dst_v
  3481. mov ecx, [esp + 4 + 16] // width
  3482. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
  3483. vpsrlw ymm5, ymm5, 8
  3484. sub edi, edx
  3485. convertloop:
  3486. vmovdqu ymm0, [eax]
  3487. vmovdqu ymm1, [eax + 32]
  3488. lea eax, [eax + 64]
  3489. vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
  3490. vpand ymm1, ymm1, ymm5
  3491. vpackuswb ymm0, ymm0, ymm1 // mutates.
  3492. vpermq ymm0, ymm0, 0xd8
  3493. vpand ymm1, ymm0, ymm5 // U
  3494. vpsrlw ymm0, ymm0, 8 // V
  3495. vpackuswb ymm1, ymm1, ymm1 // mutates.
  3496. vpackuswb ymm0, ymm0, ymm0 // mutates.
  3497. vpermq ymm1, ymm1, 0xd8
  3498. vpermq ymm0, ymm0, 0xd8
  3499. vextractf128 [edx], ymm1, 0 // U
  3500. vextractf128 [edx + edi], ymm0, 0 // V
  3501. lea edx, [edx + 16]
  3502. sub ecx, 32
  3503. jg convertloop
  3504. pop edi
  3505. vzeroupper
  3506. ret
  3507. }
  3508. }
  3509. #endif // HAS_YUY2TOYROW_AVX2
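// In YUY2 the luma samples are the even bytes (Y0 U0 Y1 V0 ...); in UYVY they
// are the odd bytes. The AVX2 kernels above select them with the 0x00ff mask
// (YUY2) or a right shift by 8 (UYVY) and then pack. An illustrative scalar
// sketch of the YUY2 case (hypothetical helper):
static void YUY2ToYRow_Sketch_C(const uint8_t* src_yuy2, uint8_t* dst_y,
                                int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[x] = src_yuy2[2 * x];  // even bytes; UYVY would use 2 * x + 1.
  }
}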
  3510. #ifdef HAS_YUY2TOYROW_SSE2
  3511. __declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
  3512. uint8_t* dst_y,
  3513. int width) {
  3514. __asm {
  3515. mov eax, [esp + 4] // src_yuy2
  3516. mov edx, [esp + 8] // dst_y
  3517. mov ecx, [esp + 12] // width
  3518. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3519. psrlw xmm5, 8
  3520. convertloop:
  3521. movdqu xmm0, [eax]
  3522. movdqu xmm1, [eax + 16]
  3523. lea eax, [eax + 32]
  3524. pand xmm0, xmm5 // even bytes are Y
  3525. pand xmm1, xmm5
  3526. packuswb xmm0, xmm1
  3527. movdqu [edx], xmm0
  3528. lea edx, [edx + 16]
  3529. sub ecx, 16
  3530. jg convertloop
  3531. ret
  3532. }
  3533. }
  3534. __declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
  3535. int stride_yuy2,
  3536. uint8_t* dst_u,
  3537. uint8_t* dst_v,
  3538. int width) {
  3539. __asm {
  3540. push esi
  3541. push edi
  3542. mov eax, [esp + 8 + 4] // src_yuy2
  3543. mov esi, [esp + 8 + 8] // stride_yuy2
  3544. mov edx, [esp + 8 + 12] // dst_u
  3545. mov edi, [esp + 8 + 16] // dst_v
  3546. mov ecx, [esp + 8 + 20] // width
  3547. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3548. psrlw xmm5, 8
  3549. sub edi, edx
  3550. convertloop:
  3551. movdqu xmm0, [eax]
  3552. movdqu xmm1, [eax + 16]
  3553. movdqu xmm2, [eax + esi]
  3554. movdqu xmm3, [eax + esi + 16]
  3555. lea eax, [eax + 32]
  3556. pavgb xmm0, xmm2
  3557. pavgb xmm1, xmm3
  3558. psrlw xmm0, 8 // YUYV -> UVUV
  3559. psrlw xmm1, 8
  3560. packuswb xmm0, xmm1
  3561. movdqa xmm1, xmm0
  3562. pand xmm0, xmm5 // U
  3563. packuswb xmm0, xmm0
  3564. psrlw xmm1, 8 // V
  3565. packuswb xmm1, xmm1
  3566. movq qword ptr [edx], xmm0
  3567. movq qword ptr [edx + edi], xmm1
  3568. lea edx, [edx + 8]
  3569. sub ecx, 16
  3570. jg convertloop
  3571. pop edi
  3572. pop esi
  3573. ret
  3574. }
  3575. }
  3576. __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
  3577. uint8_t* dst_u,
  3578. uint8_t* dst_v,
  3579. int width) {
  3580. __asm {
  3581. push edi
  3582. mov eax, [esp + 4 + 4] // src_yuy2
  3583. mov edx, [esp + 4 + 8] // dst_u
  3584. mov edi, [esp + 4 + 12] // dst_v
  3585. mov ecx, [esp + 4 + 16] // width
  3586. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3587. psrlw xmm5, 8
  3588. sub edi, edx
  3589. convertloop:
  3590. movdqu xmm0, [eax]
  3591. movdqu xmm1, [eax + 16]
  3592. lea eax, [eax + 32]
  3593. psrlw xmm0, 8 // YUYV -> UVUV
  3594. psrlw xmm1, 8
  3595. packuswb xmm0, xmm1
  3596. movdqa xmm1, xmm0
  3597. pand xmm0, xmm5 // U
  3598. packuswb xmm0, xmm0
  3599. psrlw xmm1, 8 // V
  3600. packuswb xmm1, xmm1
  3601. movq qword ptr [edx], xmm0
  3602. movq qword ptr [edx + edi], xmm1
  3603. lea edx, [edx + 8]
  3604. sub ecx, 16
  3605. jg convertloop
  3606. pop edi
  3607. ret
  3608. }
  3609. }
  3610. __declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
  3611. uint8_t* dst_y,
  3612. int width) {
  3613. __asm {
  3614. mov eax, [esp + 4] // src_uyvy
  3615. mov edx, [esp + 8] // dst_y
  3616. mov ecx, [esp + 12] // width
  3617. convertloop:
  3618. movdqu xmm0, [eax]
  3619. movdqu xmm1, [eax + 16]
  3620. lea eax, [eax + 32]
  3621. psrlw xmm0, 8 // odd bytes are Y
  3622. psrlw xmm1, 8
  3623. packuswb xmm0, xmm1
  3624. movdqu [edx], xmm0
  3625. lea edx, [edx + 16]
  3626. sub ecx, 16
  3627. jg convertloop
  3628. ret
  3629. }
  3630. }
  3631. __declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
  3632. int stride_uyvy,
  3633. uint8_t* dst_u,
  3634. uint8_t* dst_v,
  3635. int width) {
  3636. __asm {
  3637. push esi
  3638. push edi
3639. mov eax, [esp + 8 + 4] // src_uyvy
3640. mov esi, [esp + 8 + 8] // stride_uyvy
  3641. mov edx, [esp + 8 + 12] // dst_u
  3642. mov edi, [esp + 8 + 16] // dst_v
  3643. mov ecx, [esp + 8 + 20] // width
  3644. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3645. psrlw xmm5, 8
  3646. sub edi, edx
  3647. convertloop:
  3648. movdqu xmm0, [eax]
  3649. movdqu xmm1, [eax + 16]
  3650. movdqu xmm2, [eax + esi]
  3651. movdqu xmm3, [eax + esi + 16]
  3652. lea eax, [eax + 32]
  3653. pavgb xmm0, xmm2
  3654. pavgb xmm1, xmm3
  3655. pand xmm0, xmm5 // UYVY -> UVUV
  3656. pand xmm1, xmm5
  3657. packuswb xmm0, xmm1
  3658. movdqa xmm1, xmm0
  3659. pand xmm0, xmm5 // U
  3660. packuswb xmm0, xmm0
  3661. psrlw xmm1, 8 // V
  3662. packuswb xmm1, xmm1
  3663. movq qword ptr [edx], xmm0
  3664. movq qword ptr [edx + edi], xmm1
  3665. lea edx, [edx + 8]
  3666. sub ecx, 16
  3667. jg convertloop
  3668. pop edi
  3669. pop esi
  3670. ret
  3671. }
  3672. }
  3673. __declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
  3674. uint8_t* dst_u,
  3675. uint8_t* dst_v,
  3676. int width) {
  3677. __asm {
  3678. push edi
3679. mov eax, [esp + 4 + 4] // src_uyvy
  3680. mov edx, [esp + 4 + 8] // dst_u
  3681. mov edi, [esp + 4 + 12] // dst_v
  3682. mov ecx, [esp + 4 + 16] // width
  3683. pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
  3684. psrlw xmm5, 8
  3685. sub edi, edx
  3686. convertloop:
  3687. movdqu xmm0, [eax]
  3688. movdqu xmm1, [eax + 16]
  3689. lea eax, [eax + 32]
  3690. pand xmm0, xmm5 // UYVY -> UVUV
  3691. pand xmm1, xmm5
  3692. packuswb xmm0, xmm1
  3693. movdqa xmm1, xmm0
  3694. pand xmm0, xmm5 // U
  3695. packuswb xmm0, xmm0
  3696. psrlw xmm1, 8 // V
  3697. packuswb xmm1, xmm1
  3698. movq qword ptr [edx], xmm0
  3699. movq qword ptr [edx + edi], xmm1
  3700. lea edx, [edx + 8]
  3701. sub ecx, 16
  3702. jg convertloop
  3703. pop edi
  3704. ret
  3705. }
  3706. }
  3707. #endif // HAS_YUY2TOYROW_SSE2
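// The UV422 kernels above take chroma from a single row, while the UVRow
// variants first average with the row below (pavgb/vpavgb against
// [eax + esi]) to produce 4:2:0 chroma. An illustrative scalar sketch of the
// 4:2:2 YUY2 case, where each Y0 U0 Y1 V0 group covers two pixels
// (hypothetical helper; width assumed even):
static void YUY2ToUV422Row_Sketch_C(const uint8_t* src_yuy2, uint8_t* dst_u,
                                    uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    dst_u[x / 2] = src_yuy2[2 * x + 1];  // U0
    dst_v[x / 2] = src_yuy2[2 * x + 3];  // V0
  }
}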
  3708. #ifdef HAS_BLENDPLANEROW_SSSE3
  3709. // Blend 8 pixels at a time.
  3710. // unsigned version of math
  3711. // =((A2*C2)+(B2*(255-C2))+255)/256
  3712. // signed version of math
  3713. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  3714. __declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
  3715. const uint8_t* src1,
  3716. const uint8_t* alpha,
  3717. uint8_t* dst,
  3718. int width) {
  3719. __asm {
  3720. push esi
  3721. push edi
  3722. pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
  3723. psllw xmm5, 8
  3724. mov eax, 0x80808080 // 128 for biasing image to signed.
  3725. movd xmm6, eax
  3726. pshufd xmm6, xmm6, 0x00
  3727. mov eax, 0x807f807f // 32768 + 127 for unbias and round.
  3728. movd xmm7, eax
  3729. pshufd xmm7, xmm7, 0x00
  3730. mov eax, [esp + 8 + 4] // src0
  3731. mov edx, [esp + 8 + 8] // src1
  3732. mov esi, [esp + 8 + 12] // alpha
  3733. mov edi, [esp + 8 + 16] // dst
  3734. mov ecx, [esp + 8 + 20] // width
  3735. sub eax, esi
  3736. sub edx, esi
  3737. sub edi, esi
  3738. // 8 pixel loop.
  3739. convertloop8:
  3740. movq xmm0, qword ptr [esi] // alpha
  3741. punpcklbw xmm0, xmm0
  3742. pxor xmm0, xmm5 // a, 255-a
  3743. movq xmm1, qword ptr [eax + esi] // src0
  3744. movq xmm2, qword ptr [edx + esi] // src1
  3745. punpcklbw xmm1, xmm2
  3746. psubb xmm1, xmm6 // bias src0/1 - 128
  3747. pmaddubsw xmm0, xmm1
  3748. paddw xmm0, xmm7 // unbias result - 32768 and round.
  3749. psrlw xmm0, 8
  3750. packuswb xmm0, xmm0
  3751. movq qword ptr [edi + esi], xmm0
  3752. lea esi, [esi + 8]
  3753. sub ecx, 8
  3754. jg convertloop8
  3755. pop edi
  3756. pop esi
  3757. ret
  3758. }
  3759. }
  3760. #endif // HAS_BLENDPLANEROW_SSSE3
  3761. #ifdef HAS_BLENDPLANEROW_AVX2
  3762. // Blend 32 pixels at a time.
  3763. // unsigned version of math
  3764. // =((A2*C2)+(B2*(255-C2))+255)/256
  3765. // signed version of math
  3766. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  3767. __declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
  3768. const uint8_t* src1,
  3769. const uint8_t* alpha,
  3770. uint8_t* dst,
  3771. int width) {
  3772. __asm {
  3773. push esi
  3774. push edi
  3775. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
  3776. vpsllw ymm5, ymm5, 8
  3777. mov eax, 0x80808080 // 128 for biasing image to signed.
  3778. vmovd xmm6, eax
  3779. vbroadcastss ymm6, xmm6
  3780. mov eax, 0x807f807f // 32768 + 127 for unbias and round.
  3781. vmovd xmm7, eax
  3782. vbroadcastss ymm7, xmm7
  3783. mov eax, [esp + 8 + 4] // src0
  3784. mov edx, [esp + 8 + 8] // src1
  3785. mov esi, [esp + 8 + 12] // alpha
  3786. mov edi, [esp + 8 + 16] // dst
  3787. mov ecx, [esp + 8 + 20] // width
  3788. sub eax, esi
  3789. sub edx, esi
  3790. sub edi, esi
  3791. // 32 pixel loop.
  3792. convertloop32:
  3793. vmovdqu ymm0, [esi] // alpha
  3794. vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
  3795. vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
  3796. vpxor ymm3, ymm3, ymm5 // a, 255-a
  3797. vpxor ymm0, ymm0, ymm5 // a, 255-a
  3798. vmovdqu ymm1, [eax + esi] // src0
  3799. vmovdqu ymm2, [edx + esi] // src1
  3800. vpunpckhbw ymm4, ymm1, ymm2
  3801. vpunpcklbw ymm1, ymm1, ymm2
  3802. vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
  3803. vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
  3804. vpmaddubsw ymm3, ymm3, ymm4
  3805. vpmaddubsw ymm0, ymm0, ymm1
  3806. vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
  3807. vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
  3808. vpsrlw ymm3, ymm3, 8
  3809. vpsrlw ymm0, ymm0, 8
  3810. vpackuswb ymm0, ymm0, ymm3
  3811. vmovdqu [edi + esi], ymm0
  3812. lea esi, [esi + 32]
  3813. sub ecx, 32
  3814. jg convertloop32
  3815. pop edi
  3816. pop esi
  3817. vzeroupper
  3818. ret
  3819. }
  3820. }
  3821. #endif // HAS_BLENDPLANEROW_AVX2
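// BlendPlaneRow_SSSE3/AVX2 above implement the unsigned formula from the
// comments by biasing to signed so that pmaddubsw can form
// a*src0 + (255-a)*src1 in one instruction. An illustrative scalar sketch of
// the unsigned form (hypothetical helper):
static void BlendPlaneRow_Sketch_C(const uint8_t* src0, const uint8_t* src1,
                                   const uint8_t* alpha, uint8_t* dst,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    int a = alpha[x];
    dst[x] = (uint8_t)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
  }
}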
  3822. #ifdef HAS_ARGBBLENDROW_SSSE3
  3823. // Shuffle table for isolating alpha.
  3824. static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  3825. 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
  3826. // Blend 8 pixels at a time.
  3827. __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
  3828. const uint8_t* src_argb1,
  3829. uint8_t* dst_argb,
  3830. int width) {
  3831. __asm {
  3832. push esi
  3833. mov eax, [esp + 4 + 4] // src_argb0
  3834. mov esi, [esp + 4 + 8] // src_argb1
  3835. mov edx, [esp + 4 + 12] // dst_argb
  3836. mov ecx, [esp + 4 + 16] // width
  3837. pcmpeqb xmm7, xmm7 // generate constant 0x0001
  3838. psrlw xmm7, 15
  3839. pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
  3840. psrlw xmm6, 8
  3841. pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
  3842. psllw xmm5, 8
  3843. pcmpeqb xmm4, xmm4 // generate mask 0xff000000
  3844. pslld xmm4, 24
  3845. sub ecx, 4
  3846. jl convertloop4b // less than 4 pixels?
  3847. // 4 pixel loop.
  3848. convertloop4:
  3849. movdqu xmm3, [eax] // src argb
  3850. lea eax, [eax + 16]
  3851. movdqa xmm0, xmm3 // src argb
  3852. pxor xmm3, xmm4 // ~alpha
  3853. movdqu xmm2, [esi] // _r_b
  3854. pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
  3855. pand xmm2, xmm6 // _r_b
  3856. paddw xmm3, xmm7 // 256 - alpha
  3857. pmullw xmm2, xmm3 // _r_b * alpha
  3858. movdqu xmm1, [esi] // _a_g
  3859. lea esi, [esi + 16]
  3860. psrlw xmm1, 8 // _a_g
  3861. por xmm0, xmm4 // set alpha to 255
  3862. pmullw xmm1, xmm3 // _a_g * alpha
  3863. psrlw xmm2, 8 // _r_b convert to 8 bits again
  3864. paddusb xmm0, xmm2 // + src argb
  3865. pand xmm1, xmm5 // a_g_ convert to 8 bits again
  3866. paddusb xmm0, xmm1 // + src argb
  3867. movdqu [edx], xmm0
  3868. lea edx, [edx + 16]
  3869. sub ecx, 4
  3870. jge convertloop4
  3871. convertloop4b:
  3872. add ecx, 4 - 1
  3873. jl convertloop1b
  3874. // 1 pixel loop.
  3875. convertloop1:
  3876. movd xmm3, [eax] // src argb
  3877. lea eax, [eax + 4]
  3878. movdqa xmm0, xmm3 // src argb
  3879. pxor xmm3, xmm4 // ~alpha
  3880. movd xmm2, [esi] // _r_b
  3881. pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
  3882. pand xmm2, xmm6 // _r_b
  3883. paddw xmm3, xmm7 // 256 - alpha
  3884. pmullw xmm2, xmm3 // _r_b * alpha
  3885. movd xmm1, [esi] // _a_g
  3886. lea esi, [esi + 4]
  3887. psrlw xmm1, 8 // _a_g
  3888. por xmm0, xmm4 // set alpha to 255
  3889. pmullw xmm1, xmm3 // _a_g * alpha
  3890. psrlw xmm2, 8 // _r_b convert to 8 bits again
  3891. paddusb xmm0, xmm2 // + src argb
  3892. pand xmm1, xmm5 // a_g_ convert to 8 bits again
  3893. paddusb xmm0, xmm1 // + src argb
  3894. movd [edx], xmm0
  3895. lea edx, [edx + 4]
  3896. sub ecx, 1
  3897. jge convertloop1
  3898. convertloop1b:
  3899. pop esi
  3900. ret
  3901. }
  3902. }
  3903. #endif // HAS_ARGBBLENDROW_SSSE3
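// ARGBBlendRow_SSSE3 above computes, per color channel, src + dst*(256-a)/256
// with a saturating add, and forces the result alpha to 255. An illustrative
// scalar sketch (hypothetical helper; the rounding mirrors the shifts in the
// asm rather than an exact alpha blend):
static void ARGBBlendRow_Sketch_C(const uint8_t* src_argb0,
                                  const uint8_t* src_argb1, uint8_t* dst_argb,
                                  int width) {
  for (int x = 0; x < width; ++x) {
    int a = src_argb0[4 * x + 3];
    for (int i = 0; i < 3; ++i) {  // B, G, R
      int v = src_argb0[4 * x + i] + ((src_argb1[4 * x + i] * (256 - a)) >> 8);
      dst_argb[4 * x + i] = (uint8_t)(v > 255 ? 255 : v);  // paddusb saturates.
    }
    dst_argb[4 * x + 3] = 255;  // alpha is forced opaque (por xmm0, xmm4).
  }
}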
  3904. #ifdef HAS_ARGBATTENUATEROW_SSSE3
  3905. // Shuffle table duplicating alpha.
  3906. static const uvec8 kShuffleAlpha0 = {
  3907. 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
  3908. };
  3909. static const uvec8 kShuffleAlpha1 = {
  3910. 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  3911. 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
  3912. };
  3913. __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
  3914. uint8_t* dst_argb,
  3915. int width) {
  3916. __asm {
  3917. mov eax, [esp + 4] // src_argb0
  3918. mov edx, [esp + 8] // dst_argb
  3919. mov ecx, [esp + 12] // width
  3920. pcmpeqb xmm3, xmm3 // generate mask 0xff000000
  3921. pslld xmm3, 24
  3922. movdqa xmm4, xmmword ptr kShuffleAlpha0
  3923. movdqa xmm5, xmmword ptr kShuffleAlpha1
  3924. convertloop:
  3925. movdqu xmm0, [eax] // read 4 pixels
  3926. pshufb xmm0, xmm4 // isolate first 2 alphas
  3927. movdqu xmm1, [eax] // read 4 pixels
  3928. punpcklbw xmm1, xmm1 // first 2 pixel rgbs
  3929. pmulhuw xmm0, xmm1 // rgb * a
  3930. movdqu xmm1, [eax] // read 4 pixels
  3931. pshufb xmm1, xmm5 // isolate next 2 alphas
  3932. movdqu xmm2, [eax] // read 4 pixels
  3933. punpckhbw xmm2, xmm2 // next 2 pixel rgbs
  3934. pmulhuw xmm1, xmm2 // rgb * a
  3935. movdqu xmm2, [eax] // mask original alpha
  3936. lea eax, [eax + 16]
  3937. pand xmm2, xmm3
  3938. psrlw xmm0, 8
  3939. psrlw xmm1, 8
  3940. packuswb xmm0, xmm1
  3941. por xmm0, xmm2 // copy original alpha
  3942. movdqu [edx], xmm0
  3943. lea edx, [edx + 16]
  3944. sub ecx, 4
  3945. jg convertloop
  3946. ret
  3947. }
  3948. }
  3949. #endif // HAS_ARGBATTENUATEROW_SSSE3
  3950. #ifdef HAS_ARGBATTENUATEROW_AVX2
  3951. // Shuffle table duplicating alpha.
  3952. static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
  3953. 128u, 128u, 14u, 15u, 14u, 15u,
  3954. 14u, 15u, 128u, 128u};
  3955. __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
  3956. uint8_t* dst_argb,
  3957. int width) {
  3958. __asm {
  3959. mov eax, [esp + 4] // src_argb0
  3960. mov edx, [esp + 8] // dst_argb
  3961. mov ecx, [esp + 12] // width
  3962. sub edx, eax
  3963. vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
  3964. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
  3965. vpslld ymm5, ymm5, 24
  3966. convertloop:
  3967. vmovdqu ymm6, [eax] // read 8 pixels.
  3968. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
  3969. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
  3970. vpshufb ymm2, ymm0, ymm4 // low 4 alphas
  3971. vpshufb ymm3, ymm1, ymm4 // high 4 alphas
  3972. vpmulhuw ymm0, ymm0, ymm2 // rgb * a
  3973. vpmulhuw ymm1, ymm1, ymm3 // rgb * a
  3974. vpand ymm6, ymm6, ymm5 // isolate alpha
  3975. vpsrlw ymm0, ymm0, 8
  3976. vpsrlw ymm1, ymm1, 8
  3977. vpackuswb ymm0, ymm0, ymm1 // unmutated.
  3978. vpor ymm0, ymm0, ymm6 // copy original alpha
  3979. vmovdqu [eax + edx], ymm0
  3980. lea eax, [eax + 32]
  3981. sub ecx, 8
  3982. jg convertloop
  3983. vzeroupper
  3984. ret
  3985. }
  3986. }
  3987. #endif // HAS_ARGBATTENUATEROW_AVX2
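// ARGBAttenuateRow_SSSE3/AVX2 above premultiply each color channel by the
// pixel's alpha in 16-bit fixed point (pmulhuw on byte-replicated operands)
// and keep the original alpha. An illustrative scalar sketch; the exact
// fixed-point rounding in the asm differs slightly from this plain >> 8
// (hypothetical helper):
static void ARGBAttenuateRow_Sketch_C(const uint8_t* src_argb,
                                      uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int a = src_argb[4 * x + 3];
    dst_argb[4 * x + 0] = (uint8_t)((src_argb[4 * x + 0] * a) >> 8);
    dst_argb[4 * x + 1] = (uint8_t)((src_argb[4 * x + 1] * a) >> 8);
    dst_argb[4 * x + 2] = (uint8_t)((src_argb[4 * x + 2] * a) >> 8);
    dst_argb[4 * x + 3] = (uint8_t)a;  // alpha passes through unchanged.
  }
}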
  3988. #ifdef HAS_ARGBUNATTENUATEROW_SSE2
  3989. // Unattenuate 4 pixels at a time.
  3990. __declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
  3991. uint8_t* dst_argb,
  3992. int width) {
  3993. __asm {
  3994. push ebx
  3995. push esi
  3996. push edi
  3997. mov eax, [esp + 12 + 4] // src_argb
  3998. mov edx, [esp + 12 + 8] // dst_argb
  3999. mov ecx, [esp + 12 + 12] // width
  4000. lea ebx, fixed_invtbl8
  4001. convertloop:
  4002. movdqu xmm0, [eax] // read 4 pixels
  4003. movzx esi, byte ptr [eax + 3] // first alpha
  4004. movzx edi, byte ptr [eax + 7] // second alpha
  4005. punpcklbw xmm0, xmm0 // first 2
  4006. movd xmm2, dword ptr [ebx + esi * 4]
  4007. movd xmm3, dword ptr [ebx + edi * 4]
  4008. pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
  4009. pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
  4010. movlhps xmm2, xmm3
  4011. pmulhuw xmm0, xmm2 // rgb * a
  4012. movdqu xmm1, [eax] // read 4 pixels
  4013. movzx esi, byte ptr [eax + 11] // third alpha
4014. movzx edi, byte ptr [eax + 15] // fourth alpha
  4015. punpckhbw xmm1, xmm1 // next 2
  4016. movd xmm2, dword ptr [ebx + esi * 4]
  4017. movd xmm3, dword ptr [ebx + edi * 4]
  4018. pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
  4019. pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
  4020. movlhps xmm2, xmm3
  4021. pmulhuw xmm1, xmm2 // rgb * a
  4022. lea eax, [eax + 16]
  4023. packuswb xmm0, xmm1
  4024. movdqu [edx], xmm0
  4025. lea edx, [edx + 16]
  4026. sub ecx, 4
  4027. jg convertloop
  4028. pop edi
  4029. pop esi
  4030. pop ebx
  4031. ret
  4032. }
  4033. }
  4034. #endif // HAS_ARGBUNATTENUATEROW_SSE2
  4035. #ifdef HAS_ARGBUNATTENUATEROW_AVX2
  4036. // Shuffle table duplicating alpha.
  4037. static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  4038. 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
  4039. // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
  4040. // USE_GATHER is not on by default, due to being a slow instruction.
  4041. #ifdef USE_GATHER
  4042. __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
  4043. uint8_t* dst_argb,
  4044. int width) {
  4045. __asm {
  4046. mov eax, [esp + 4] // src_argb0
  4047. mov edx, [esp + 8] // dst_argb
  4048. mov ecx, [esp + 12] // width
  4049. sub edx, eax
  4050. vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
  4051. convertloop:
  4052. vmovdqu ymm6, [eax] // read 8 pixels.
  4053. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
  4054. vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
  4055. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
  4056. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
  4057. vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
  4058. vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
  4059. vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
  4060. vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
  4061. vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
  4062. vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
  4063. vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
  4064. vpackuswb ymm0, ymm0, ymm1 // unmutated.
  4065. vmovdqu [eax + edx], ymm0
  4066. lea eax, [eax + 32]
  4067. sub ecx, 8
  4068. jg convertloop
  4069. vzeroupper
  4070. ret
  4071. }
  4072. }
  4073. #else // USE_GATHER
  4074. __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
  4075. uint8_t* dst_argb,
  4076. int width) {
  4077. __asm {
  4078. push ebx
  4079. push esi
  4080. push edi
  4081. mov eax, [esp + 12 + 4] // src_argb
  4082. mov edx, [esp + 12 + 8] // dst_argb
  4083. mov ecx, [esp + 12 + 12] // width
  4084. sub edx, eax
  4085. lea ebx, fixed_invtbl8
  4086. vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
  4087. convertloop:
  4088. // replace VPGATHER
  4089. movzx esi, byte ptr [eax + 3] // alpha0
  4090. movzx edi, byte ptr [eax + 7] // alpha1
  4091. vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
  4092. vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
  4093. movzx esi, byte ptr [eax + 11] // alpha2
  4094. movzx edi, byte ptr [eax + 15] // alpha3
  4095. vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
  4096. vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
  4097. vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
  4098. movzx esi, byte ptr [eax + 19] // alpha4
  4099. movzx edi, byte ptr [eax + 23] // alpha5
  4100. vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
  4101. vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
  4102. vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
  4103. movzx esi, byte ptr [eax + 27] // alpha6
  4104. movzx edi, byte ptr [eax + 31] // alpha7
  4105. vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
  4106. vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
  4107. vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
  4108. vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
  4109. vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
  4110. vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
  4111. vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
  4112. // end of VPGATHER
  4113. vmovdqu ymm6, [eax] // read 8 pixels.
  4114. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
  4115. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
  4116. vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
  4117. vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
  4118. vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
  4119. vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
  4120. vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
  4121. vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
  4122. vpackuswb ymm0, ymm0, ymm1 // unmutated.
  4123. vmovdqu [eax + edx], ymm0
  4124. lea eax, [eax + 32]
  4125. sub ecx, 8
  4126. jg convertloop
  4127. pop edi
  4128. pop esi
  4129. pop ebx
  4130. vzeroupper
  4131. ret
  4132. }
  4133. }
  4134. #endif // USE_GATHER
4135. #endif // HAS_ARGBUNATTENUATEROW_AVX2
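// ARGBUnattenuateRow_SSE2/AVX2 above undo the premultiplication by looking up
// a fixed-point reciprocal of alpha in fixed_invtbl8 rather than dividing.
// An illustrative scalar sketch using a real divide (hypothetical helper;
// the a == 0 behavior and the rounding are choices made for the sketch only):
static void ARGBUnattenuateRow_Sketch_C(const uint8_t* src_argb,
                                        uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int a = src_argb[4 * x + 3];
    for (int i = 0; i < 3; ++i) {
      int c = src_argb[4 * x + i];
      int v = a ? (c * 255) / a : c;  // divide the attenuated channel back out.
      dst_argb[4 * x + i] = (uint8_t)(v > 255 ? 255 : v);
    }
    dst_argb[4 * x + 3] = (uint8_t)a;
  }
}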
  4136. #ifdef HAS_ARGBGRAYROW_SSSE3
  4137. // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
  4138. __declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
  4139. uint8_t* dst_argb,
  4140. int width) {
  4141. __asm {
  4142. mov eax, [esp + 4] /* src_argb */
  4143. mov edx, [esp + 8] /* dst_argb */
  4144. mov ecx, [esp + 12] /* width */
  4145. movdqa xmm4, xmmword ptr kARGBToYJ
  4146. movdqa xmm5, xmmword ptr kAddYJ64
  4147. convertloop:
  4148. movdqu xmm0, [eax] // G
  4149. movdqu xmm1, [eax + 16]
  4150. pmaddubsw xmm0, xmm4
  4151. pmaddubsw xmm1, xmm4
  4152. phaddw xmm0, xmm1
  4153. paddw xmm0, xmm5 // Add .5 for rounding.
  4154. psrlw xmm0, 7
  4155. packuswb xmm0, xmm0 // 8 G bytes
  4156. movdqu xmm2, [eax] // A
  4157. movdqu xmm3, [eax + 16]
  4158. lea eax, [eax + 32]
  4159. psrld xmm2, 24
  4160. psrld xmm3, 24
  4161. packuswb xmm2, xmm3
  4162. packuswb xmm2, xmm2 // 8 A bytes
  4163. movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
  4164. punpcklbw xmm0, xmm0 // 8 GG words
  4165. punpcklbw xmm3, xmm2 // 8 GA words
  4166. movdqa xmm1, xmm0
  4167. punpcklwd xmm0, xmm3 // GGGA first 4
  4168. punpckhwd xmm1, xmm3 // GGGA next 4
  4169. movdqu [edx], xmm0
  4170. movdqu [edx + 16], xmm1
  4171. lea edx, [edx + 32]
  4172. sub ecx, 8
  4173. jg convertloop
  4174. ret
  4175. }
  4176. }
  4177. #endif // HAS_ARGBGRAYROW_SSSE3
  4178. #ifdef HAS_ARGBSEPIAROW_SSSE3
  4179. // b = (r * 35 + g * 68 + b * 17) >> 7
  4180. // g = (r * 45 + g * 88 + b * 22) >> 7
  4181. // r = (r * 50 + g * 98 + b * 24) >> 7
  4182. // Constant for ARGB color to sepia tone.
  4183. static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
  4184. 17, 68, 35, 0, 17, 68, 35, 0};
  4185. static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
  4186. 22, 88, 45, 0, 22, 88, 45, 0};
  4187. static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
  4188. 24, 98, 50, 0, 24, 98, 50, 0};
  4189. // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
  4190. __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  4191. __asm {
  4192. mov eax, [esp + 4] /* dst_argb */
  4193. mov ecx, [esp + 8] /* width */
  4194. movdqa xmm2, xmmword ptr kARGBToSepiaB
  4195. movdqa xmm3, xmmword ptr kARGBToSepiaG
  4196. movdqa xmm4, xmmword ptr kARGBToSepiaR
  4197. convertloop:
  4198. movdqu xmm0, [eax] // B
  4199. movdqu xmm6, [eax + 16]
  4200. pmaddubsw xmm0, xmm2
  4201. pmaddubsw xmm6, xmm2
  4202. phaddw xmm0, xmm6
  4203. psrlw xmm0, 7
  4204. packuswb xmm0, xmm0 // 8 B values
  4205. movdqu xmm5, [eax] // G
  4206. movdqu xmm1, [eax + 16]
  4207. pmaddubsw xmm5, xmm3
  4208. pmaddubsw xmm1, xmm3
  4209. phaddw xmm5, xmm1
  4210. psrlw xmm5, 7
  4211. packuswb xmm5, xmm5 // 8 G values
  4212. punpcklbw xmm0, xmm5 // 8 BG values
  4213. movdqu xmm5, [eax] // R
  4214. movdqu xmm1, [eax + 16]
  4215. pmaddubsw xmm5, xmm4
  4216. pmaddubsw xmm1, xmm4
  4217. phaddw xmm5, xmm1
  4218. psrlw xmm5, 7
  4219. packuswb xmm5, xmm5 // 8 R values
  4220. movdqu xmm6, [eax] // A
  4221. movdqu xmm1, [eax + 16]
  4222. psrld xmm6, 24
  4223. psrld xmm1, 24
  4224. packuswb xmm6, xmm1
  4225. packuswb xmm6, xmm6 // 8 A values
  4226. punpcklbw xmm5, xmm6 // 8 RA values
  4227. movdqa xmm1, xmm0 // Weave BG, RA together
  4228. punpcklwd xmm0, xmm5 // BGRA first 4
  4229. punpckhwd xmm1, xmm5 // BGRA next 4
  4230. movdqu [eax], xmm0
  4231. movdqu [eax + 16], xmm1
  4232. lea eax, [eax + 32]
  4233. sub ecx, 8
  4234. jg convertloop
  4235. ret
  4236. }
  4237. }
  4238. #endif // HAS_ARGBSEPIAROW_SSSE3
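// ARGBSepiaRow_SSSE3 above applies the three weighted sums from the comments
// (via pmaddubsw/phaddw with kARGBToSepiaB/G/R) in place, with packuswb
// providing the clamp. An illustrative scalar sketch (hypothetical helper):
static void ARGBSepiaRow_Sketch_C(uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int b = dst_argb[4 * x + 0];
    int g = dst_argb[4 * x + 1];
    int r = dst_argb[4 * x + 2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[4 * x + 0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[4 * x + 1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[4 * x + 2] = (uint8_t)(sr > 255 ? 255 : sr);
    // Alpha (dst_argb[4 * x + 3]) is left unchanged, as in the asm.
  }
}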
  4239. #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4240. // Transform 8 ARGB pixels (32 bytes) with color matrix.
  4241. // Same as Sepia except matrix is provided.
  4242. // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
  4243. // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
  4244. __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
  4245. uint8_t* dst_argb,
  4246. const int8_t* matrix_argb,
  4247. int width) {
  4248. __asm {
  4249. mov eax, [esp + 4] /* src_argb */
  4250. mov edx, [esp + 8] /* dst_argb */
  4251. mov ecx, [esp + 12] /* matrix_argb */
  4252. movdqu xmm5, [ecx]
  4253. pshufd xmm2, xmm5, 0x00
  4254. pshufd xmm3, xmm5, 0x55
  4255. pshufd xmm4, xmm5, 0xaa
  4256. pshufd xmm5, xmm5, 0xff
  4257. mov ecx, [esp + 16] /* width */
  4258. convertloop:
  4259. movdqu xmm0, [eax] // B
  4260. movdqu xmm7, [eax + 16]
  4261. pmaddubsw xmm0, xmm2
  4262. pmaddubsw xmm7, xmm2
  4263. movdqu xmm6, [eax] // G
  4264. movdqu xmm1, [eax + 16]
  4265. pmaddubsw xmm6, xmm3
  4266. pmaddubsw xmm1, xmm3
  4267. phaddsw xmm0, xmm7 // B
  4268. phaddsw xmm6, xmm1 // G
  4269. psraw xmm0, 6 // B
  4270. psraw xmm6, 6 // G
  4271. packuswb xmm0, xmm0 // 8 B values
  4272. packuswb xmm6, xmm6 // 8 G values
  4273. punpcklbw xmm0, xmm6 // 8 BG values
  4274. movdqu xmm1, [eax] // R
  4275. movdqu xmm7, [eax + 16]
  4276. pmaddubsw xmm1, xmm4
  4277. pmaddubsw xmm7, xmm4
  4278. phaddsw xmm1, xmm7 // R
  4279. movdqu xmm6, [eax] // A
  4280. movdqu xmm7, [eax + 16]
  4281. pmaddubsw xmm6, xmm5
  4282. pmaddubsw xmm7, xmm5
  4283. phaddsw xmm6, xmm7 // A
  4284. psraw xmm1, 6 // R
  4285. psraw xmm6, 6 // A
  4286. packuswb xmm1, xmm1 // 8 R values
  4287. packuswb xmm6, xmm6 // 8 A values
  4288. punpcklbw xmm1, xmm6 // 8 RA values
  4289. movdqa xmm6, xmm0 // Weave BG, RA together
  4290. punpcklwd xmm0, xmm1 // BGRA first 4
  4291. punpckhwd xmm6, xmm1 // BGRA next 4
  4292. movdqu [edx], xmm0
  4293. movdqu [edx + 16], xmm6
  4294. lea eax, [eax + 32]
  4295. lea edx, [edx + 32]
  4296. sub ecx, 8
  4297. jg convertloop
  4298. ret
  4299. }
  4300. }
  4301. #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
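// ARGBColorMatrixRow_SSSE3 above computes each output channel as a signed dot
// product of the BGRA pixel with one 4-byte row of matrix_argb, scaled down
// by psraw 6 and clamped by packuswb. An illustrative scalar sketch that
// ignores the intermediate 16-bit saturation of pmaddubsw/phaddsw
// (hypothetical helper):
static void ARGBColorMatrixRow_Sketch_C(const uint8_t* src_argb,
                                        uint8_t* dst_argb,
                                        const int8_t* matrix_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int b = src_argb[4 * x + 0];
    int g = src_argb[4 * x + 1];
    int r = src_argb[4 * x + 2];
    int a = src_argb[4 * x + 3];
    for (int i = 0; i < 4; ++i) {  // one matrix row per output channel.
      int v = b * matrix_argb[4 * i + 0] + g * matrix_argb[4 * i + 1] +
              r * matrix_argb[4 * i + 2] + a * matrix_argb[4 * i + 3];
      if (v < 0) v = 0;  // negatives end up 0 after packuswb anyway.
      v >>= 6;           // same scale as psraw xmm, 6.
      dst_argb[4 * x + i] = (uint8_t)(v > 255 ? 255 : v);
    }
  }
}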
  4302. #ifdef HAS_ARGBQUANTIZEROW_SSE2
  4303. // Quantize 4 ARGB pixels (16 bytes).
  4304. __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
  4305. int scale,
  4306. int interval_size,
  4307. int interval_offset,
  4308. int width) {
  4309. __asm {
  4310. mov eax, [esp + 4] /* dst_argb */
  4311. movd xmm2, [esp + 8] /* scale */
  4312. movd xmm3, [esp + 12] /* interval_size */
  4313. movd xmm4, [esp + 16] /* interval_offset */
  4314. mov ecx, [esp + 20] /* width */
  4315. pshuflw xmm2, xmm2, 040h
  4316. pshufd xmm2, xmm2, 044h
  4317. pshuflw xmm3, xmm3, 040h
  4318. pshufd xmm3, xmm3, 044h
  4319. pshuflw xmm4, xmm4, 040h
  4320. pshufd xmm4, xmm4, 044h
  4321. pxor xmm5, xmm5 // constant 0
  4322. pcmpeqb xmm6, xmm6 // generate mask 0xff000000
  4323. pslld xmm6, 24
  4324. convertloop:
  4325. movdqu xmm0, [eax] // read 4 pixels
  4326. punpcklbw xmm0, xmm5 // first 2 pixels
  4327. pmulhuw xmm0, xmm2 // pixel * scale >> 16
  4328. movdqu xmm1, [eax] // read 4 pixels
  4329. punpckhbw xmm1, xmm5 // next 2 pixels
  4330. pmulhuw xmm1, xmm2
  4331. pmullw xmm0, xmm3 // * interval_size
  4332. movdqu xmm7, [eax] // read 4 pixels
  4333. pmullw xmm1, xmm3
  4334. pand xmm7, xmm6 // mask alpha
  4335. paddw xmm0, xmm4 // + interval_size / 2
  4336. paddw xmm1, xmm4
  4337. packuswb xmm0, xmm1
  4338. por xmm0, xmm7
  4339. movdqu [eax], xmm0
  4340. lea eax, [eax + 16]
  4341. sub ecx, 4
  4342. jg convertloop
  4343. ret
  4344. }
  4345. }
  4346. #endif // HAS_ARGBQUANTIZEROW_SSE2
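// ARGBQuantizeRow_SSE2 above posterizes each color channel in place:
// (v * scale >> 16) * interval_size + interval_offset, with alpha masked and
// preserved. An illustrative scalar sketch (hypothetical helper):
static void ARGBQuantizeRow_Sketch_C(uint8_t* dst_argb, int scale,
                                     int interval_size, int interval_offset,
                                     int width) {
  for (int x = 0; x < width; ++x) {
    for (int i = 0; i < 3; ++i) {  // quantize B, G, R; alpha is kept.
      int v = dst_argb[4 * x + i];
      v = (v * scale >> 16) * interval_size + interval_offset;
      dst_argb[4 * x + i] = (uint8_t)(v > 255 ? 255 : v);  // packuswb clamp.
    }
  }
}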
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width,
                                         uint32_t value) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    movd xmm2, [esp + 16] // value
    punpcklbw xmm2, xmm2
    punpcklqdq xmm2, xmm2
 convertloop:
    movdqu xmm0, [eax] // read 4 pixels
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0 // first 2
    punpckhbw xmm1, xmm1 // next 2
    pmulhuw xmm0, xmm2 // argb * value
    pmulhuw xmm1, xmm2 // argb * value
    psrlw xmm0, 8
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop
    ret
  }
}
#endif // HAS_ARGBSHADEROW_SSE2
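// Illustrative scalar sketch (not in the original file; compiled out): the
// shader above multiplies every channel by the matching channel of 'value'
// in fixed point. Both bytes are replicated (x * 0x101), the high 16 bits of
// the product are kept and shifted down 8 more, which approximates
// src * value / 255.
#if 0
static void ARGBShadeRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                int width, uint32_t value) {
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      uint32_t s = src_argb[c] * 0x101u;                   // punpcklbw src, src
      uint32_t v = ((value >> (c * 8)) & 0xff) * 0x101u;   // replicated value
      dst_argb[c] = (uint8_t)(((s * v) >> 16) >> 8);       // pmulhuw, psrlw 8
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif // illustrative sketch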
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    pxor xmm5, xmm5 // constant 0
 convertloop:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    movdqu xmm2, [esi] // read 4 pixels from src_argb1
    movdqu xmm1, xmm0
    movdqu xmm3, xmm2
    punpcklbw xmm0, xmm0 // first 2
    punpckhbw xmm1, xmm1 // next 2
    punpcklbw xmm2, xmm5 // first 2
    punpckhbw xmm3, xmm5 // next 2
    pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
    pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
    lea eax, [eax + 16]
    lea esi, [esi + 16]
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_ARGBMULTIPLYROW_SSE2
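// Illustrative scalar sketch (not in the original file; compiled out): the
// multiply above replicates one source byte into 8.8 fixed point so each
// channel becomes roughly s0 * s1 / 255:
//   dst = (s0 * 0x101 * s1) >> 16
#if 0
static void ARGBMultiplyRow_Sketch(const uint8_t* src_argb0,
                                   const uint8_t* src_argb1,
                                   uint8_t* dst_argb, int width) {
  for (int i = 0; i < width * 4; ++i) {
    dst_argb[i] = (uint8_t)((src_argb0[i] * 0x101u * src_argb1[i]) >> 16);
  }
}
#endif // illustrative sketch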
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
                                       const uint8_t* src_argb1,
                                       uint8_t* dst_argb,
                                       int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub ecx, 4
    jl convertloop49
 convertloop4:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    lea eax, [eax + 16]
    movdqu xmm1, [esi] // read 4 pixels from src_argb1
    lea esi, [esi + 16]
    paddusb xmm0, xmm1 // src_argb0 + src_argb1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jge convertloop4
 convertloop49:
    add ecx, 4 - 1
    jl convertloop19
 convertloop1:
    movd xmm0, [eax] // read 1 pixel from src_argb0
    lea eax, [eax + 4]
    movd xmm1, [esi] // read 1 pixel from src_argb1
    lea esi, [esi + 4]
    paddusb xmm0, xmm1 // src_argb0 + src_argb1
    movd [edx], xmm0
    lea edx, [edx + 4]
    sub ecx, 1
    jge convertloop1
 convertloop19:
    pop esi
    ret
  }
}
#endif // HAS_ARGBADDROW_SSE2
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
 convertloop:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    lea eax, [eax + 16]
    movdqu xmm1, [esi] // read 4 pixels from src_argb1
    lea esi, [esi + 16]
    psubusb xmm0, xmm1 // src_argb0 - src_argb1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_ARGBSUBTRACTROW_SSE2
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    vpxor ymm5, ymm5, ymm5 // constant 0
 convertloop:
    vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
    lea esi, [esi + 32]
    vpunpcklbw ymm0, ymm1, ymm1 // low 4
    vpunpckhbw ymm1, ymm1, ymm1 // high 4
    vpunpcklbw ymm2, ymm3, ymm5 // low 4
    vpunpckhbw ymm3, ymm3, ymm5 // high 4
    vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
    vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
    vpackuswb ymm0, ymm0, ymm1
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBMULTIPLYROW_AVX2
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
                                       const uint8_t* src_argb1,
                                       uint8_t* dst_argb,
                                       int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
 convertloop:
    vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
    lea esi, [esi + 32]
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBADDROW_AVX2
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
 convertloop:
    vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
    lea esi, [esi + 32]
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBSUBTRACTROW_AVX2
#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
                                      const uint8_t* src_y1,
                                      const uint8_t* src_y2,
                                      uint8_t* dst_sobelx,
                                      int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_y0
    mov esi, [esp + 8 + 8] // src_y1
    mov edi, [esp + 8 + 12] // src_y2
    mov edx, [esp + 8 + 16] // dst_sobelx
    mov ecx, [esp + 8 + 20] // width
    sub esi, eax
    sub edi, eax
    sub edx, eax
    pxor xmm5, xmm5 // constant 0
 convertloop:
    movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
    movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    psubw xmm0, xmm1
    movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
    movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
    punpcklbw xmm1, xmm5
    punpcklbw xmm2, xmm5
    psubw xmm1, xmm2
    movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
    movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5
    psubw xmm2, xmm3
    paddw xmm0, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm1
    pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
    psubw xmm1, xmm0
    pmaxsw xmm0, xmm1
    packuswb xmm0, xmm0
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    sub ecx, 8
    jg convertloop
    pop edi
    pop esi
    ret
  }
}
#endif // HAS_SOBELXROW_SSE2
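// Illustrative scalar sketch (not in the original file; compiled out) of the
// kernel above: the three rows are weighted 1, 2, 1 on the horizontal
// difference between columns i and i + 2, then the absolute value is clamped
// to a byte.
#if 0
static void SobelXRow_Sketch(const uint8_t* src_y0, const uint8_t* src_y1,
                             const uint8_t* src_y2, uint8_t* dst_sobelx,
                             int width) {
  for (int i = 0; i < width; ++i) {
    int s = (src_y0[i] - src_y0[i + 2]) + 2 * (src_y1[i] - src_y1[i + 2]) +
            (src_y2[i] - src_y2[i + 2]);
    if (s < 0) s = -s;
    dst_sobelx[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}
#endif // illustrative sketch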
#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
                                      const uint8_t* src_y1,
                                      uint8_t* dst_sobely,
                                      int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_y0
    mov esi, [esp + 4 + 8] // src_y1
    mov edx, [esp + 4 + 12] // dst_sobely
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    sub edx, eax
    pxor xmm5, xmm5 // constant 0
 convertloop:
    movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
    movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    psubw xmm0, xmm1
    movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
    movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
    punpcklbw xmm1, xmm5
    punpcklbw xmm2, xmm5
    psubw xmm1, xmm2
    movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
    movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5
    psubw xmm2, xmm3
    paddw xmm0, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm1
    pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
    psubw xmm1, xmm0
    pmaxsw xmm0, xmm1
    packuswb xmm0, xmm0
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    sub ecx, 8
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_SOBELYROW_SSE2
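// Illustrative scalar sketch (not in the original file; compiled out): same
// structure as SobelX, but the vertical difference between the two rows is
// weighted 1, 2, 1 across columns i, i + 1, i + 2.
#if 0
static void SobelYRow_Sketch(const uint8_t* src_y0, const uint8_t* src_y1,
                             uint8_t* dst_sobely, int width) {
  for (int i = 0; i < width; ++i) {
    int s = (src_y0[i] - src_y1[i]) + 2 * (src_y0[i + 1] - src_y1[i + 1]) +
            (src_y0[i + 2] - src_y1[i + 2]);
    if (s < 0) s = -s;
    dst_sobely[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}
#endif // illustrative sketch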
#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
                                     const uint8_t* src_sobely,
                                     uint8_t* dst_argb,
                                     int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    pcmpeqb xmm5, xmm5 // alpha 255
    pslld xmm5, 24 // 0xff000000
 convertloop:
    movdqu xmm0, [eax] // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    paddusb xmm0, xmm1 // sobel = sobelx + sobely
    movdqa xmm2, xmm0 // GG
    punpcklbw xmm2, xmm0 // First 8
    punpckhbw xmm0, xmm0 // Next 8
    movdqa xmm1, xmm2 // GGGG
    punpcklwd xmm1, xmm2 // First 4
    punpckhwd xmm2, xmm2 // Next 4
    por xmm1, xmm5 // GGGA
    por xmm2, xmm5
    movdqa xmm3, xmm0 // GGGG
    punpcklwd xmm3, xmm0 // Next 4
    punpckhwd xmm0, xmm0 // Last 4
    por xmm3, xmm5 // GGGA
    por xmm0, xmm5
    movdqu [edx], xmm1
    movdqu [edx + 16], xmm2
    movdqu [edx + 32], xmm3
    movdqu [edx + 48], xmm0
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_SOBELROW_SSE2
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
                                            const uint8_t* src_sobely,
                                            uint8_t* dst_y,
                                            int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_y
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
 convertloop:
    movdqu xmm0, [eax] // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    paddusb xmm0, xmm1 // sobel = sobelx + sobely
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_SOBELTOPLANEROW_SSE2
#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
                                       const uint8_t* src_sobely,
                                       uint8_t* dst_argb,
                                       int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    pcmpeqb xmm5, xmm5 // alpha 255
 convertloop:
    movdqu xmm0, [eax] // read 16 pixels src_sobelx
    movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    movdqa xmm2, xmm0
    paddusb xmm2, xmm1 // sobel = sobelx + sobely
    movdqa xmm3, xmm0 // XA
    punpcklbw xmm3, xmm5
    punpckhbw xmm0, xmm5
    movdqa xmm4, xmm1 // YS
    punpcklbw xmm4, xmm2
    punpckhbw xmm1, xmm2
    movdqa xmm6, xmm4 // YSXA
    punpcklwd xmm6, xmm3 // First 4
    punpckhwd xmm4, xmm3 // Next 4
    movdqa xmm7, xmm1 // YSXA
    punpcklwd xmm7, xmm0 // Next 4
    punpckhwd xmm1, xmm0 // Last 4
    movdqu [edx], xmm6
    movdqu [edx + 16], xmm4
    movdqu [edx + 32], xmm7
    movdqu [edx + 48], xmm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_SOBELXYROW_SSE2
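// Illustrative scalar sketch (not in the original file; compiled out) of the
// per-pixel packing done by SobelXYRow above.
#if 0
static void SobelXYRow_Sketch(const uint8_t* src_sobelx,
                              const uint8_t* src_sobely, uint8_t* dst_argb,
                              int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    dst_argb[0] = src_sobely[i];                 // B = Sobel Y
    dst_argb[1] = (uint8_t)(s > 255 ? 255 : s);  // G = Sobel (saturated sum)
    dst_argb[2] = src_sobelx[i];                 // R = Sobel X
    dst_argb[3] = 255;                           // A
    dst_argb += 4;
  }
}
#endif // illustrative sketch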
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Consider float CumulativeSum.
// Consider calling CumulativeSum one row at time as needed.
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
// Convert cumulative sum for an area to an average for 1 pixel.
// topleft is pointer to top left of CumulativeSum buffer for area.
// botleft is pointer to bottom left of CumulativeSum buffer.
// width is offset from left to right of area in CumulativeSum buffer measured
// in number of ints.
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time.
// This function requires alignment on accumulation buffer pointers.
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
                                    const int32_t* botleft,
                                    int width,
                                    int area,
                                    uint8_t* dst,
                                    int count) {
  __asm {
    mov eax, topleft // eax topleft
    mov esi, botleft // esi botleft
    mov edx, width
    movd xmm5, area
    mov edi, dst
    mov ecx, count
    cvtdq2ps xmm5, xmm5
    rcpss xmm4, xmm5 // 1.0f / area
    pshufd xmm4, xmm4, 0
    sub ecx, 4
    jl l4b
    cmp area, 128 // 128 pixels will not overflow 15 bits.
    ja l4
    pshufd xmm5, xmm5, 0 // area
    pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
    psrld xmm6, 16
    cvtdq2ps xmm6, xmm6
    addps xmm5, xmm6 // (65536.0 + area - 1)
    mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
    cvtps2dq xmm5, xmm5 // 0.16 fixed point
    packssdw xmm5, xmm5 // 16 bit shorts
    // 4 pixel loop small blocks.
 s4:
    // top left
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]
    // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]
    // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]
    packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
    packssdw xmm2, xmm3
    pmulhuw xmm0, xmm5
    pmulhuw xmm2, xmm5
    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge s4
    jmp l4b
    // 4 pixel loop
 l4:
    // top left
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]
    // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]
    // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]
    cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
    cvtdq2ps xmm1, xmm1
    mulps xmm0, xmm4
    mulps xmm1, xmm4
    cvtdq2ps xmm2, xmm2
    cvtdq2ps xmm3, xmm3
    mulps xmm2, xmm4
    mulps xmm3, xmm4
    cvtps2dq xmm0, xmm0
    cvtps2dq xmm1, xmm1
    cvtps2dq xmm2, xmm2
    cvtps2dq xmm3, xmm3
    packssdw xmm0, xmm1
    packssdw xmm2, xmm3
    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge l4
 l4b:
    add ecx, 4 - 1
    jl l1b
    // 1 pixel loop
 l1:
    movdqu xmm0, [eax]
    psubd xmm0, [eax + edx * 4]
    lea eax, [eax + 16]
    psubd xmm0, [esi]
    paddd xmm0, [esi + edx * 4]
    lea esi, [esi + 16]
    cvtdq2ps xmm0, xmm0
    mulps xmm0, xmm4
    cvtps2dq xmm0, xmm0
    packssdw xmm0, xmm0
    packuswb xmm0, xmm0
    movd dword ptr [edi], xmm0
    lea edi, [edi + 4]
    sub ecx, 1
    jge l1
 l1b:
  }
}
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
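// Illustrative scalar sketch (not in the original file; compiled out): each
// output channel is a box average read out of the integral image,
//   sum = TL - TR - BL + BR,  dst = sum / area,
// where 'width' is the left-to-right offset inside the buffer in int32 units.
// The vector code additionally uses a 0.16 fixed-point reciprocal for small
// areas; that fast path is omitted here.
#if 0
static void CumulativeSumToAverageRow_Sketch(const int32_t* topleft,
                                             const int32_t* botleft, int width,
                                             int area, uint8_t* dst,
                                             int count) {
  float ooa = 1.0f / area;
  for (int i = 0; i < count * 4; ++i) {  // 4 int32 channels per ARGB pixel.
    int32_t sum =
        topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width];
    dst[i] = (uint8_t)(sum * ooa);
  }
}
#endif // illustrative sketch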
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
                                  int32_t* cumsum,
                                  const int32_t* previous_cumsum,
                                  int width) {
  __asm {
    mov eax, row
    mov edx, cumsum
    mov esi, previous_cumsum
    mov ecx, width
    pxor xmm0, xmm0
    pxor xmm1, xmm1
    sub ecx, 4
    jl l4b
    test edx, 15
    jne l4b
    // 4 pixel loop
 l4:
    movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
    lea eax, [eax + 16]
    movdqa xmm4, xmm2
    punpcklbw xmm2, xmm1
    movdqa xmm3, xmm2
    punpcklwd xmm2, xmm1
    punpckhwd xmm3, xmm1
    punpckhbw xmm4, xmm1
    movdqa xmm5, xmm4
    punpcklwd xmm4, xmm1
    punpckhwd xmm5, xmm1
    paddd xmm0, xmm2
    movdqu xmm2, [esi] // previous row above.
    paddd xmm2, xmm0
    paddd xmm0, xmm3
    movdqu xmm3, [esi + 16]
    paddd xmm3, xmm0
    paddd xmm0, xmm4
    movdqu xmm4, [esi + 32]
    paddd xmm4, xmm0
    paddd xmm0, xmm5
    movdqu xmm5, [esi + 48]
    lea esi, [esi + 64]
    paddd xmm5, xmm0
    movdqu [edx], xmm2
    movdqu [edx + 16], xmm3
    movdqu [edx + 32], xmm4
    movdqu [edx + 48], xmm5
    lea edx, [edx + 64]
    sub ecx, 4
    jge l4
 l4b:
    add ecx, 4 - 1
    jl l1b
    // 1 pixel loop
 l1:
    movd xmm2, dword ptr [eax] // 1 argb pixel
    lea eax, [eax + 4]
    punpcklbw xmm2, xmm1
    punpcklwd xmm2, xmm1
    paddd xmm0, xmm2
    movdqu xmm2, [esi]
    lea esi, [esi + 16]
    paddd xmm2, xmm0
    movdqu [edx], xmm2
    lea edx, [edx + 16]
    sub ecx, 1
    jge l1
 l1b:
  }
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
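// Illustrative scalar sketch (not in the original file; compiled out): each
// output int is the running sum of this row's channel values so far plus the
// cumulative sum from the row above, which builds the integral image consumed
// by CumulativeSumToAverageRow.
#if 0
static void ComputeCumulativeSumRow_Sketch(const uint8_t* row, int32_t* cumsum,
                                           const int32_t* previous_cumsum,
                                           int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}
#endif // illustrative sketch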
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
                                                     int src_argb_stride,
                                                     uint8_t* dst_argb,
                                                     const float* uv_dudv,
                                                     int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 12] // src_argb
    mov esi, [esp + 16] // stride
    mov edx, [esp + 20] // dst_argb
    mov ecx, [esp + 24] // pointer to uv_dudv
    movq xmm2, qword ptr [ecx] // uv
    movq xmm7, qword ptr [ecx + 8] // dudv
    mov ecx, [esp + 28] // width
    shl esi, 16 // 4, stride
    add esi, 4
    movd xmm5, esi
    sub ecx, 4
    jl l4b
    // setup for 4 pixel loop
    pshufd xmm7, xmm7, 0x44 // dup dudv
    pshufd xmm5, xmm5, 0 // dup 4, stride
    movdqa xmm0, xmm2 // x0, y0, x1, y1
    addps xmm0, xmm7
    movlhps xmm2, xmm0
    movdqa xmm4, xmm7
    addps xmm4, xmm4 // dudv *= 2
    movdqa xmm3, xmm2 // x2, y2, x3, y3
    addps xmm3, xmm4
    addps xmm4, xmm4 // dudv *= 4
    // 4 pixel loop
 l4:
    cvttps2dq xmm0, xmm2 // x, y float to int first 2
    cvttps2dq xmm1, xmm3 // x, y float to int next 2
    packssdw xmm0, xmm1 // x, y as 8 shorts
    pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd edi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd xmm1, [eax + esi] // read pixel 0
    movd xmm6, [eax + edi] // read pixel 1
    punpckldq xmm1, xmm6 // combine pixel 0 and 1
    addps xmm2, xmm4 // x, y += dx, dy first 2
    movq qword ptr [edx], xmm1
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd edi, xmm0
    movd xmm6, [eax + esi] // read pixel 2
    movd xmm0, [eax + edi] // read pixel 3
    punpckldq xmm6, xmm0 // combine pixel 2 and 3
    addps xmm3, xmm4 // x, y += dx, dy next 2
    movq qword ptr 8[edx], xmm6
    lea edx, [edx + 16]
    sub ecx, 4
    jge l4
 l4b:
    add ecx, 4 - 1
    jl l1b
    // 1 pixel loop
 l1:
    cvttps2dq xmm0, xmm2 // x, y float to int
    packssdw xmm0, xmm0 // x, y as shorts
    pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
    addps xmm2, xmm7 // x, y += dx, dy
    movd esi, xmm0
    movd xmm0, [eax + esi] // copy a pixel
    movd [edx], xmm0
    lea edx, [edx + 4]
    sub ecx, 1
    jge l1
 l1b:
    pop edi
    pop esi
    ret
  }
}
#endif // HAS_ARGBAFFINEROW_SSE2
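// Illustrative scalar sketch (not in the original file; compiled out):
// uv_dudv holds the starting source position (u, v) and the per-pixel step
// (du, dv); each destination pixel is fetched from the nearest source pixel
// along that line (no filtering).
#if 0
static void ARGBAffineRow_Sketch(const uint8_t* src_argb, int src_argb_stride,
                                 uint8_t* dst_argb, const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;
    int y = (int)v;
    *(uint32_t*)dst_argb =
        *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4);
    dst_argb += 4;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}
#endif // illustrative sketch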
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
                                           const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           int dst_width,
                                           int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4] // dst_ptr
    mov esi, [esp + 8 + 8] // src_ptr
    mov edx, [esp + 8 + 12] // src_stride
    mov ecx, [esp + 8 + 16] // dst_width
    mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
    // Dispatch to specialized filters if applicable.
    cmp eax, 0
    je xloop100 // 0 / 256. Blend 100 / 0.
    sub edi, esi
    cmp eax, 128
    je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
    vmovd xmm0, eax // high fraction 0..255
    neg eax
    add eax, 256
    vmovd xmm5, eax // low fraction 256..1
    vpunpcklbw xmm5, xmm5, xmm0
    vpunpcklwd xmm5, xmm5, xmm5
    vbroadcastss ymm5, xmm5
    mov eax, 0x80808080 // 128b for bias and rounding.
    vmovd xmm4, eax
    vbroadcastss ymm4, xmm4
 xloop:
    vmovdqu ymm0, [esi]
    vmovdqu ymm2, [esi + edx]
    vpunpckhbw ymm1, ymm0, ymm2 // mutates
    vpunpcklbw ymm0, ymm0, ymm2
    vpsubb ymm1, ymm1, ymm4 // bias to signed image
    vpsubb ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm5, ymm1
    vpmaddubsw ymm0, ymm5, ymm0
    vpaddw ymm1, ymm1, ymm4 // unbias and round
    vpaddw ymm0, ymm0, ymm4
    vpsrlw ymm1, ymm1, 8
    vpsrlw ymm0, ymm0, 8
    vpackuswb ymm0, ymm0, ymm1 // unmutates
    vmovdqu [esi + edi], ymm0
    lea esi, [esi + 32]
    sub ecx, 32
    jg xloop
    jmp xloop99
    // Blend 50 / 50.
 xloop50:
    vmovdqu ymm0, [esi]
    vpavgb ymm0, ymm0, [esi + edx]
    vmovdqu [esi + edi], ymm0
    lea esi, [esi + 32]
    sub ecx, 32
    jg xloop50
    jmp xloop99
    // Blend 100 / 0 - Copy row unchanged.
 xloop100:
    rep movsb
 xloop99:
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_INTERPOLATEROW_AVX2
// Bilinear filter 16x2 -> 16x1
// TODO(fbarchard): Consider allowing 256 using memcpy.
__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
                                            const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            int dst_width,
                                            int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4] // dst_ptr
    mov esi, [esp + 8 + 8] // src_ptr
    mov edx, [esp + 8 + 12] // src_stride
    mov ecx, [esp + 8 + 16] // dst_width
    mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
    sub edi, esi
    // Dispatch to specialized filters if applicable.
    cmp eax, 0
    je xloop100 // 0 / 256. Blend 100 / 0.
    cmp eax, 128
    je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
    movd xmm0, eax // high fraction 0..255
    neg eax
    add eax, 256
    movd xmm5, eax // low fraction 255..1
    punpcklbw xmm5, xmm0
    punpcklwd xmm5, xmm5
    pshufd xmm5, xmm5, 0
    mov eax, 0x80808080 // 128 for biasing image to signed.
    movd xmm4, eax
    pshufd xmm4, xmm4, 0x00
 xloop:
    movdqu xmm0, [esi]
    movdqu xmm2, [esi + edx]
    movdqu xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    psubb xmm0, xmm4 // bias image by -128
    psubb xmm1, xmm4
    movdqa xmm2, xmm5
    movdqa xmm3, xmm5
    pmaddubsw xmm2, xmm0
    pmaddubsw xmm3, xmm1
    paddw xmm2, xmm4
    paddw xmm3, xmm4
    psrlw xmm2, 8
    psrlw xmm3, 8
    packuswb xmm2, xmm3
    movdqu [esi + edi], xmm2
    lea esi, [esi + 16]
    sub ecx, 16
    jg xloop
    jmp xloop99
    // Blend 50 / 50.
 xloop50:
    movdqu xmm0, [esi]
    movdqu xmm1, [esi + edx]
    pavgb xmm0, xmm1
    movdqu [esi + edi], xmm0
    lea esi, [esi + 16]
    sub ecx, 16
    jg xloop50
    jmp xloop99
    // Blend 100 / 0 - Copy row unchanged.
 xloop100:
    movdqu xmm0, [esi]
    movdqu [esi + edi], xmm0
    lea esi, [esi + 16]
    sub ecx, 16
    jg xloop100
 xloop99:
    pop edi
    pop esi
    ret
  }
}
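// Illustrative scalar sketch (not in the original file; compiled out) of what
// both InterpolateRow variants above compute: a vertical blend of two rows
// with an 8-bit fraction. The signed-bias trick (psubb/pmaddubsw/paddw) is
// arithmetically equivalent to
//   dst[x] = (src[x] * (256 - f) + src[x + stride] * f + 128) >> 8,
// which also covers the dispatched cases f == 0 (copy) and f == 128 (pavgb).
#if 0
static void InterpolateRow_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                  ptrdiff_t src_stride, int dst_width,
                                  int source_y_fraction) {
  int f = source_y_fraction;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[x] * (256 - f) +
                            src_ptr[x + src_stride] * f + 128) >> 8);
  }
}
#endif // illustrative sketch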
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                                            uint8_t* dst_argb,
                                            const uint8_t* shuffler,
                                            int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // shuffler
    movdqu xmm5, [ecx]
    mov ecx, [esp + 16] // width
 wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pshufb xmm0, xmm5
    pshufb xmm1, xmm5
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg wloop
    ret
  }
}
#ifdef HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                                           uint8_t* dst_argb,
                                           const uint8_t* shuffler,
                                           int width) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // shuffler
    vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
    mov ecx, [esp + 16] // width
 wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpshufb ymm0, ymm0, ymm5
    vpshufb ymm1, ymm1, ymm5
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg wloop
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBSHUFFLEROW_AVX2
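// Illustrative sketch (not in the original file; compiled out): 'shuffler' is
// a 16-byte pshufb control mask, so output byte i is taken from input byte
// shuffler[i & 15] of the same 16-byte group. The mask below is only an
// example: it swaps B and R within every 4-byte pixel.
#if 0
static const uint8_t kSwapBRMaskExample[16] = {2,  1, 0, 3,  6,  5,  4,  7,
                                               10, 9, 8, 11, 14, 13, 12, 15};
static void ARGBShuffleRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                  const uint8_t* shuffler, int width) {
  for (int i = 0; i < width * 4; ++i) {
    dst_argb[i] = src_argb[(i & ~15) + shuffler[i & 15]];
  }
}
#endif // illustrative sketch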
// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1
__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                                          const uint8_t* src_u,
                                          const uint8_t* src_v,
                                          uint8_t* dst_frame,
                                          int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_y
    mov esi, [esp + 8 + 8] // src_u
    mov edx, [esp + 8 + 12] // src_v
    mov edi, [esp + 8 + 16] // dst_frame
    mov ecx, [esp + 8 + 20] // width
    sub edx, esi
 convertloop:
    movq xmm2, qword ptr [esi] // U
    movq xmm3, qword ptr [esi + edx] // V
    lea esi, [esi + 8]
    punpcklbw xmm2, xmm3 // UV
    movdqu xmm0, [eax] // Y
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2 // YUYV
    punpckhbw xmm1, xmm2
    movdqu [edi], xmm0
    movdqu [edi + 16], xmm1
    lea edi, [edi + 32]
    sub ecx, 16
    jg convertloop
    pop edi
    pop esi
    ret
  }
}
__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                                          const uint8_t* src_u,
                                          const uint8_t* src_v,
                                          uint8_t* dst_frame,
                                          int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_y
    mov esi, [esp + 8 + 8] // src_u
    mov edx, [esp + 8 + 12] // src_v
    mov edi, [esp + 8 + 16] // dst_frame
    mov ecx, [esp + 8 + 20] // width
    sub edx, esi
 convertloop:
    movq xmm2, qword ptr [esi] // U
    movq xmm3, qword ptr [esi + edx] // V
    lea esi, [esi + 8]
    punpcklbw xmm2, xmm3 // UV
    movdqu xmm0, [eax] // Y
    movdqa xmm1, xmm2
    lea eax, [eax + 16]
    punpcklbw xmm1, xmm0 // UYVY
    punpckhbw xmm2, xmm0
    movdqu [edi], xmm1
    movdqu [edi + 16], xmm2
    lea edi, [edi + 32]
    sub ecx, 16
    jg convertloop
    pop edi
    pop esi
    ret
  }
}
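// Illustrative scalar sketch (not in the original file; compiled out) of the
// two packers above: each macro-pixel takes two Y samples and one U/V pair.
#if 0
static void I422ToYUY2Row_Sketch(const uint8_t* src_y, const uint8_t* src_u,
                                 const uint8_t* src_v, uint8_t* dst_frame,
                                 int width) {
  for (int x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];  // YUY2 byte order: Y0 U0 Y1 V0.
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    // For UYVY the same samples are written as U0 Y0 V0 Y1.
    dst_frame += 4;
    src_y += 2;
    ++src_u;
    ++src_v;
  }
}
#endif // illustrative sketch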
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                                              uint8_t* dst_argb,
                                              const float* poly,
                                              int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] /* src_argb */
    mov edx, [esp + 4 + 8] /* dst_argb */
    mov esi, [esp + 4 + 12] /* poly */
    mov ecx, [esp + 4 + 16] /* width */
    pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
    // 2 pixel loop.
 convertloop:
    // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
    // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
    movq xmm0, qword ptr [eax] // BGRABGRA
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm3
    movdqa xmm4, xmm0
    punpcklwd xmm0, xmm3 // pixel 0
    punpckhwd xmm4, xmm3 // pixel 1
    cvtdq2ps xmm0, xmm0 // 4 floats
    cvtdq2ps xmm4, xmm4
    movdqa xmm1, xmm0 // X
    movdqa xmm5, xmm4
    mulps xmm0, [esi + 16] // C1 * X
    mulps xmm4, [esi + 16]
    addps xmm0, [esi] // result = C0 + C1 * X
    addps xmm4, [esi]
    movdqa xmm2, xmm1
    movdqa xmm6, xmm5
    mulps xmm2, xmm1 // X * X
    mulps xmm6, xmm5
    mulps xmm1, xmm2 // X * X * X
    mulps xmm5, xmm6
    mulps xmm2, [esi + 32] // C2 * X * X
    mulps xmm6, [esi + 32]
    mulps xmm1, [esi + 48] // C3 * X * X * X
    mulps xmm5, [esi + 48]
    addps xmm0, xmm2 // result += C2 * X * X
    addps xmm4, xmm6
    addps xmm0, xmm1 // result += C3 * X * X * X
    addps xmm4, xmm5
    cvttps2dq xmm0, xmm0
    cvttps2dq xmm4, xmm4
    packuswb xmm0, xmm4
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 2
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                                              uint8_t* dst_argb,
                                              const float* poly,
                                              int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_argb */
    mov ecx, [esp + 12] /* poly */
    vbroadcastf128 ymm4, [ecx] // C0
    vbroadcastf128 ymm5, [ecx + 16] // C1
    vbroadcastf128 ymm6, [ecx + 32] // C2
    vbroadcastf128 ymm7, [ecx + 48] // C3
    mov ecx, [esp + 16] /* width */
    // 2 pixel loop.
 convertloop:
    vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
    lea eax, [eax + 8]
    vcvtdq2ps ymm0, ymm0 // X 8 floats
    vmulps ymm2, ymm0, ymm0 // X * X
    vmulps ymm3, ymm0, ymm7 // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
    vcvttps2dq ymm0, ymm0
    vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
    vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
    vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
    vmovq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 2
    jg convertloop
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
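// Illustrative scalar sketch (not in the original file; compiled out): 'poly'
// holds four sets of per-channel float coefficients (C0 at poly[0..3], C1 at
// poly[4..7], C2 at poly[8..11], C3 at poly[12..15]); each channel is pushed
// through a cubic polynomial, truncated, and saturated back to a byte.
#if 0
static void ARGBPolynomialRow_Sketch(const uint8_t* src_argb,
                                     uint8_t* dst_argb, const float* poly,
                                     int width) {
  for (int i = 0; i < width * 4; ++i) {
    int c = i & 3;  // channel index: B, G, R, A
    float x = (float)src_argb[i];
    float r = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
              poly[c + 12] * x * x * x;
    int v = (int)r;  // truncate, matching cvttps2dq
    dst_argb[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}
#endif // illustrative sketch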
#ifdef HAS_HALFFLOATROW_SSE2
static float kExpBias = 1.9259299444e-34f;
__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
                                         uint16_t* dst,
                                         float scale,
                                         int width) {
  __asm {
    mov eax, [esp + 4] /* src */
    mov edx, [esp + 8] /* dst */
    movd xmm4, dword ptr [esp + 12] /* scale */
    mov ecx, [esp + 16] /* width */
    mulss xmm4, kExpBias
    pshufd xmm4, xmm4, 0
    pxor xmm5, xmm5
    sub edx, eax
    // 8 pixel loop.
 convertloop:
    movdqu xmm2, xmmword ptr [eax] // 8 shorts
    add eax, 16
    movdqa xmm3, xmm2
    punpcklwd xmm2, xmm5
    cvtdq2ps xmm2, xmm2 // convert 8 ints to floats
    punpckhwd xmm3, xmm5
    cvtdq2ps xmm3, xmm3
    mulps xmm2, xmm4
    mulps xmm3, xmm4
    psrld xmm2, 13
    psrld xmm3, 13
    packssdw xmm2, xmm3
    movdqu [eax + edx - 16], xmm2
    sub ecx, 8
    jg convertloop
    ret
  }
}
#endif // HAS_HALFFLOATROW_SSE2
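// Why the exponent bias above works (added note, not in the original file):
// 1.9259299444e-34f is 2^-112. Multiplying a non-negative float by 2^-112
// lowers its exponent field by 112, so after shifting the 32-bit pattern
// right by 13 the exponent sits in the binary16 position with bias
// 127 - 112 = 15 and 13 mantissa bits are truncated away, i.e. the psrld 13
// plus pack is a truncating float-to-half conversion. A compiled-out sketch:
#if 0
static uint16_t FloatToHalf_Sketch(float value, float scale) {
  union {
    float f;
    uint32_t u;
  } bits;
  bits.f = value * scale * 1.9259299444e-34f;  // scale, then apply 2^-112
  return (uint16_t)(bits.u >> 13);             // psrld 13 + pack
}
#endif // illustrative sketch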
#ifdef HAS_HALFFLOATROW_AVX2
__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
                                         uint16_t* dst,
                                         float scale,
                                         int width) {
  __asm {
    mov eax, [esp + 4] /* src */
    mov edx, [esp + 8] /* dst */
    movd xmm4, dword ptr [esp + 12] /* scale */
    mov ecx, [esp + 16] /* width */
    vmulss xmm4, xmm4, kExpBias
    vbroadcastss ymm4, xmm4
    vpxor ymm5, ymm5, ymm5
    sub edx, eax
    // 16 pixel loop.
 convertloop:
    vmovdqu ymm2, [eax] // 16 shorts
    add eax, 32
    vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
    vpunpcklwd ymm2, ymm2, ymm5
    vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
    vcvtdq2ps ymm2, ymm2
    vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
    vmulps ymm2, ymm2, ymm4
    vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
    vpsrld ymm2, ymm2, 13
    vpackssdw ymm2, ymm2, ymm3
    vmovdqu [eax + edx - 32], ymm2
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif // HAS_HALFFLOATROW_AVX2
#ifdef HAS_HALFFLOATROW_F16C
__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
                                         uint16_t* dst,
                                         float scale,
                                         int width) {
  __asm {
    mov eax, [esp + 4] /* src */
    mov edx, [esp + 8] /* dst */
    vbroadcastss ymm4, [esp + 12] /* scale */
    mov ecx, [esp + 16] /* width */
    sub edx, eax
    // 16 pixel loop.
 convertloop:
    vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
    vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
    add eax, 32
    vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
    vcvtdq2ps ymm3, ymm3
    vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
    vmulps ymm3, ymm3, ymm4
    vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
    vcvtps2ph xmm3, ymm3, 3
    vmovdqu [eax + edx - 32], xmm2 // store behind the advanced src pointer
    vmovdqu [eax + edx - 32 + 16], xmm3
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif // HAS_HALFFLOATROW_F16C
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
                                             const uint8_t* table_argb,
                                             int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] /* dst_argb */
    mov esi, [esp + 4 + 8] /* table_argb */
    mov ecx, [esp + 4 + 12] /* width */
    // 1 pixel loop.
 convertloop:
    movzx edx, byte ptr [eax]
    lea eax, [eax + 4]
    movzx edx, byte ptr [esi + edx * 4]
    mov byte ptr [eax - 4], dl
    movzx edx, byte ptr [eax - 4 + 1]
    movzx edx, byte ptr [esi + edx * 4 + 1]
    mov byte ptr [eax - 4 + 1], dl
    movzx edx, byte ptr [eax - 4 + 2]
    movzx edx, byte ptr [esi + edx * 4 + 2]
    mov byte ptr [eax - 4 + 2], dl
    movzx edx, byte ptr [eax - 4 + 3]
    movzx edx, byte ptr [esi + edx * 4 + 3]
    mov byte ptr [eax - 4 + 3], dl
    dec ecx
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_ARGBCOLORTABLEROW_X86
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
                                            const uint8_t* table_argb,
                                            int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] /* dst_argb */
    mov esi, [esp + 4 + 8] /* table_argb */
    mov ecx, [esp + 4 + 12] /* width */
    // 1 pixel loop.
 convertloop:
    movzx edx, byte ptr [eax]
    lea eax, [eax + 4]
    movzx edx, byte ptr [esi + edx * 4]
    mov byte ptr [eax - 4], dl
    movzx edx, byte ptr [eax - 4 + 1]
    movzx edx, byte ptr [esi + edx * 4 + 1]
    mov byte ptr [eax - 4 + 1], dl
    movzx edx, byte ptr [eax - 4 + 2]
    movzx edx, byte ptr [esi + edx * 4 + 2]
    mov byte ptr [eax - 4 + 2], dl
    dec ecx
    jg convertloop
    pop esi
    ret
  }
}
#endif // HAS_RGBCOLORTABLEROW_X86
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                                   uint8_t* dst_argb,
                                                   int width,
                                                   const uint8_t* luma,
                                                   uint32_t lumacoeff) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] /* src_argb */
    mov edi, [esp + 8 + 8] /* dst_argb */
    mov ecx, [esp + 8 + 12] /* width */
    movd xmm2, dword ptr [esp + 8 + 16] // luma table
    movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
    pshufd xmm2, xmm2, 0
    pshufd xmm3, xmm3, 0
    pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
    psllw xmm4, 8
    pxor xmm5, xmm5
    // 4 pixel loop.
 convertloop:
    movdqu xmm0, xmmword ptr [eax] // generate luma ptr
    pmaddubsw xmm0, xmm3
    phaddw xmm0, xmm0
    pand xmm0, xmm4 // mask out low bits
    punpcklwd xmm0, xmm5
    paddd xmm0, xmm2 // add table base
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
    movzx edx, byte ptr [eax]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi], dl
    movzx edx, byte ptr [eax + 1]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 1], dl
    movzx edx, byte ptr [eax + 2]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 2], dl
    movzx edx, byte ptr [eax + 3] // copy alpha.
    mov byte ptr [edi + 3], dl
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
    movzx edx, byte ptr [eax + 4]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 4], dl
    movzx edx, byte ptr [eax + 5]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 5], dl
    movzx edx, byte ptr [eax + 6]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 6], dl
    movzx edx, byte ptr [eax + 7] // copy alpha.
    mov byte ptr [edi + 7], dl
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
    movzx edx, byte ptr [eax + 8]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 8], dl
    movzx edx, byte ptr [eax + 9]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 9], dl
    movzx edx, byte ptr [eax + 10]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 10], dl
    movzx edx, byte ptr [eax + 11] // copy alpha.
    mov byte ptr [edi + 11], dl
    movd esi, xmm0
    movzx edx, byte ptr [eax + 12]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 12], dl
    movzx edx, byte ptr [eax + 13]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 13], dl
    movzx edx, byte ptr [eax + 14]
    movzx edx, byte ptr [esi + edx]
    mov byte ptr [edi + 14], dl
    movzx edx, byte ptr [eax + 15] // copy alpha.
    mov byte ptr [edi + 15], dl
    lea eax, [eax + 16]
    lea edi, [edi + 16]
    sub ecx, 4
    jg convertloop
    pop edi
    pop esi
    ret
  }
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
#endif // defined(_M_X64)
#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))