// row_gcc.cc

/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
                               25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
// JPEG full range.
static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
                                29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
                                0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};
static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};
static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
                              -18, -94, 112, 0, -18, -94, 112, 0};
static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
                               0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};
static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
                               66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};
static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
                               0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};
static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
                               0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                               0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
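
// Illustrative scalar sketch (not part of libyuv): how the kARGBToY
// coefficients above map onto the usual BT.601 limited-range luma formula.
// libyuv "ARGB" is byte order B, G, R, A in memory, which is why the blue
// coefficient (25) comes first in each table.  The SIMD rows handle the
// bias/rounding differently (see kAddY16 and kSub128); this only shows the
// intended per-pixel arithmetic.
static inline uint8_t ScalarARGBToY_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);  // ~16..235
}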

#ifdef HAS_RGB24TOARGBROW_SSSE3
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGBA.
static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
                                            14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};

// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8,  10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4,  4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9,  11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5,  5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3
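
// Illustrative scalar model of SSSE3 pshufb (not part of libyuv), included
// to document how the shuffle tables above are read: each mask byte selects
// a source byte by index, and a mask byte with its high bit set (the 128u
// entries) zeroes that destination byte.
static inline void Pshufb128_Sketch(const uint8_t src[16],
                                    const uint8_t mask[16],
                                    uint8_t dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0u : src[mask[i] & 15];
  }
}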

#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0x18,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklwd %%xmm0,%%xmm0 \n"
      "punpckhwd %%xmm1,%%xmm1 \n"
      "por %%xmm5,%%xmm0 \n"
      "por %%xmm5,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_y),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_J400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
      "pslld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm3 \n"
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskRGB24ToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xff000000
      "pslld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm3 \n"
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskRAWToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// Same code as RAWToARGB with different shuffler and A in low bits
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0x000000ff
      "psrld $0x18,%%xmm5 \n"
      "movdqa %3,%%xmm4 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm3 \n"
      "lea 0x30(%0),%0 \n"
      "movdqa %%xmm3,%%xmm2 \n"
      "palignr $0x8,%%xmm1,%%xmm2 \n"
      "pshufb %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm2 \n"
      "palignr $0xc,%%xmm0,%%xmm1 \n"
      "pshufb %%xmm4,%%xmm0 \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "por %%xmm5,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "palignr $0x4,%%xmm3,%%xmm3 \n"
      "pshufb %%xmm4,%%xmm3 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm3,0x30(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw),  // %0
        "+r"(dst_rgba),  // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskRAWToRGBA)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "movdqa %3,%%xmm3 \n"
      "movdqa %4,%%xmm4 \n"
      "movdqa %5,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x4(%0),%%xmm1 \n"
      "movdqu 0x8(%0),%%xmm2 \n"
      "lea 0x18(%0),%0 \n"
      "pshufb %%xmm3,%%xmm0 \n"
      "pshufb %%xmm4,%%xmm1 \n"
      "pshufb %%xmm5,%%xmm2 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x8(%1) \n"
      "movq %%xmm2,0x10(%1) \n"
      "lea 0x18(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_raw),  // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskRAWToRGB24_0),  // %3
        "m"(kShuffleMaskRAWToRGB24_1),  // %4
        "m"(kShuffleMaskRAWToRGB24_2)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0x1080108,%%eax \n"
      "movd %%eax,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "mov $0x20802080,%%eax \n"
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psllw $0xb,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0xa,%%xmm4 \n"
      "psrlw $0x5,%%xmm4 \n"
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psllw $0x8,%%xmm7 \n"
      "sub %0,%1 \n"
      "sub %0,%1 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm3,%%xmm1 \n"
      "psllw $0xb,%%xmm2 \n"
      "pmulhuw %%xmm5,%%xmm1 \n"
      "pmulhuw %%xmm5,%%xmm2 \n"
      "psllw $0x8,%%xmm1 \n"
      "por %%xmm2,%%xmm1 \n"
      "pand %%xmm4,%%xmm0 \n"
      "pmulhuw %%xmm6,%%xmm0 \n"
      "por %%xmm7,%%xmm0 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm2 \n"
      "movdqu %%xmm1,0x00(%1,%0,2) \n"
      "movdqu %%xmm2,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
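
// Illustrative scalar sketch (not part of libyuv) of the pmulhuw constants
// used above: with a 5-bit field shifted to the top of a 16-bit lane,
// multiplying by 0x0108 and keeping the high 16 bits of the product is the
// usual 5-to-8 bit expansion (v << 3) | (v >> 2); 0x2080 likewise performs
// the 6-to-8 bit expansion (v << 2) | (v >> 4) for the green field.
static inline uint8_t Expand5To8_Sketch(uint8_t v5) {
  uint32_t lane = (uint32_t)v5 << 11;        // field in bits 15..11
  return (uint8_t)((lane * 0x0108u) >> 16);  // == (v5 << 3) | (v5 >> 2)
}
static inline uint8_t Expand6To8_Sketch(uint8_t v6) {
  uint32_t lane = (uint32_t)v6 << 5;         // field in bits 10..5
  return (uint8_t)((lane * 0x2080u) >> 16);  // == (v6 << 2) | (v6 >> 4)
}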

void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0x1080108,%%eax \n"
      "movd %%eax,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "mov $0x42004200,%%eax \n"
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psllw $0xb,%%xmm3 \n"
      "movdqa %%xmm3,%%xmm4 \n"
      "psrlw $0x6,%%xmm4 \n"
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psllw $0x8,%%xmm7 \n"
      "sub %0,%1 \n"
      "sub %0,%1 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "psllw $0x1,%%xmm1 \n"
      "psllw $0xb,%%xmm2 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pmulhuw %%xmm5,%%xmm2 \n"
      "pmulhuw %%xmm5,%%xmm1 \n"
      "psllw $0x8,%%xmm1 \n"
      "por %%xmm2,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm4,%%xmm0 \n"
      "psraw $0x8,%%xmm2 \n"
      "pmulhuw %%xmm6,%%xmm0 \n"
      "pand %%xmm7,%%xmm2 \n"
      "por %%xmm2,%%xmm0 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"
      "punpckhbw %%xmm0,%%xmm2 \n"
      "movdqu %%xmm1,0x00(%1,%0,2) \n"
      "movdqu %%xmm2,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "mov $0xf0f0f0f,%%eax \n"
      "movd %%eax,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "pslld $0x4,%%xmm5 \n"
      "sub %0,%1 \n"
      "sub %0,%1 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pand %%xmm4,%%xmm0 \n"
      "pand %%xmm5,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "psllw $0x4,%%xmm1 \n"
      "psrlw $0x4,%%xmm3 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm3,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"
      "punpckhbw %%xmm2,%%xmm1 \n"
      "movdqu %%xmm0,0x00(%1,%0,2) \n"
      "movdqu %%xmm1,0x10(%1,%0,2) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm6 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "pshufb %%xmm6,%%xmm0 \n"
      "pshufb %%xmm6,%%xmm1 \n"
      "pshufb %%xmm6,%%xmm2 \n"
      "pshufb %%xmm6,%%xmm3 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "psrldq $0x4,%%xmm1 \n"
      "pslldq $0xc,%%xmm4 \n"
      "movdqa %%xmm2,%%xmm5 \n"
      "por %%xmm4,%%xmm0 \n"
      "pslldq $0x8,%%xmm5 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "psrldq $0x8,%%xmm2 \n"
      "pslldq $0x4,%%xmm3 \n"
      "por %%xmm3,%%xmm2 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "lea 0x30(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskARGBToRGB24)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "movdqa %3,%%xmm6 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x20(%0),%%xmm2 \n"
      "movdqu 0x30(%0),%%xmm3 \n"
      "lea 0x40(%0),%0 \n"
      "pshufb %%xmm6,%%xmm0 \n"
      "pshufb %%xmm6,%%xmm1 \n"
      "pshufb %%xmm6,%%xmm2 \n"
      "pshufb %%xmm6,%%xmm3 \n"
      "movdqa %%xmm1,%%xmm4 \n"
      "psrldq $0x4,%%xmm1 \n"
      "pslldq $0xc,%%xmm4 \n"
      "movdqa %%xmm2,%%xmm5 \n"
      "por %%xmm4,%%xmm0 \n"
      "pslldq $0x8,%%xmm5 \n"
      "movdqu %%xmm0,(%1) \n"
      "por %%xmm5,%%xmm1 \n"
      "psrldq $0x8,%%xmm2 \n"
      "pslldq $0x4,%%xmm3 \n"
      "por %%xmm3,%%xmm2 \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "movdqu %%xmm2,0x20(%1) \n"
      "lea 0x30(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskARGBToRAW)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#ifdef HAS_ARGBTORGB24ROW_AVX2
// vpermd for 12+12 to 24
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};

void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6 \n"
      "vmovdqa %4,%%ymm7 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpshufb %%ymm6,%%ymm0,%%ymm0 \n"  // xxx0yyy0
      "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
      "vpermd %%ymm0,%%ymm7,%%ymm0 \n"  // pack to 24 bytes
      "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
      "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
      "vpermq $0x3f,%%ymm1,%%ymm4 \n"  // combine 24 + 8
      "vpor %%ymm4,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vpermq $0xf9,%%ymm1,%%ymm1 \n"  // combine 16 + 16
      "vpermq $0x4f,%%ymm2,%%ymm4 \n"
      "vpor %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vpermq $0xfe,%%ymm2,%%ymm2 \n"  // combine 8 + 24
      "vpermq $0x93,%%ymm3,%%ymm3 \n"
      "vpor %%ymm3,%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskARGBToRGB24),  // %3
        "m"(kPermdRGB24_AVX)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
// Shuffle table for converting ARGBToRGB24
static const ulvec8 kPermARGBToRGB24_0 = {
    0u,  1u,  2u,  4u,  5u,  6u,  8u,  9u,  10u, 12u, 13u,
    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
static const ulvec8 kPermARGBToRGB24_1 = {
    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
static const ulvec8 kPermARGBToRGB24_2 = {
    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};

void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vmovdqa %3,%%ymm5 \n"
      "vmovdqa %4,%%ymm6 \n"
      "vmovdqa %5,%%ymm7 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
      "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
      "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      : "m"(kPermARGBToRGB24_0),  // %3
        "m"(kPermARGBToRGB24_1),  // %4
        "m"(kPermARGBToRGB24_2)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
}
#endif
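
// Illustrative scalar model of vpermt2b as used above (not part of libyuv;
// masking is ignored): the destination register and the second source act as
// one 64-byte table, and each control byte selects one of those 64 bytes
// (indices 0-31 from the first table, 32-63 from the second).
static inline void Vpermt2b256_Sketch(const uint8_t table_a[32],
                                      const uint8_t idx[32],
                                      const uint8_t table_b[32],
                                      uint8_t out[32]) {
  for (int i = 0; i < 32; ++i) {
    const uint8_t j = idx[i] & 63;
    out[i] = (j < 32) ? table_a[j] : table_b[j - 32];
  }
}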

#ifdef HAS_ARGBTORAWROW_AVX2
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm6 \n"
      "vmovdqa %4,%%ymm7 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      "vpshufb %%ymm6,%%ymm0,%%ymm0 \n"  // xxx0yyy0
      "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
      "vpermd %%ymm0,%%ymm7,%%ymm0 \n"  // pack to 24 bytes
      "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
      "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
      "vpermq $0x3f,%%ymm1,%%ymm4 \n"  // combine 24 + 8
      "vpor %%ymm4,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vpermq $0xf9,%%ymm1,%%ymm1 \n"  // combine 16 + 16
      "vpermq $0x4f,%%ymm2,%%ymm4 \n"
      "vpor %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "vpermq $0xfe,%%ymm2,%%ymm2 \n"  // combine 8 + 24
      "vpermq $0x93,%%ymm3,%%ymm3 \n"
      "vpor %%ymm3,%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm2,0x40(%1) \n"
      "lea 0x60(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      : "m"(kShuffleMaskARGBToRAW),  // %3
        "m"(kPermdRGB24_AVX)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psrld $0x1b,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1a,%%xmm4 \n"
      "pslld $0x5,%%xmm4 \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0xb,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pslld $0x8,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x5,%%xmm2 \n"
      "psrad $0x10,%%xmm0 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pand %%xmm4,%%xmm2 \n"
      "pand %%xmm5,%%xmm0 \n"
      "por %%xmm2,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
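
// Illustrative scalar sketch (not part of libyuv) of the packing above:
// per pixel it is a straight truncation to 5/6/5 bits.  The Dither variants
// below first add the per-column dither byte with unsigned saturation
// (paddusb) before truncating.
static inline uint16_t PackRGB565_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}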

void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "movd %3,%%xmm6 \n"
      "punpcklbw %%xmm6,%%xmm6 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "punpcklwd %%xmm6,%%xmm6 \n"
      "punpckhwd %%xmm7,%%xmm7 \n"
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psrld $0x1b,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1a,%%xmm4 \n"
      "pslld $0x5,%%xmm4 \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0xb,%%xmm5 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "paddusb %%xmm6,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pslld $0x8,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x5,%%xmm2 \n"
      "psrad $0x10,%%xmm0 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pand %%xmm4,%%xmm2 \n"
      "pand %%xmm5,%%xmm0 \n"
      "por %%xmm2,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                uint8_t* dst,
                                const uint32_t dither4,
                                int width) {
  asm volatile(
      "vbroadcastss %3,%%xmm6 \n"
      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
      "vpermq $0xd8,%%ymm6,%%ymm6 \n"
      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
      "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
      "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
      "vpslld $0x5,%%ymm4,%%ymm4 \n"
      "vpslld $0xb,%%ymm3,%%ymm5 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
      "vpsrld $0x5,%%ymm0,%%ymm2 \n"
      "vpsrld $0x3,%%ymm0,%%ymm1 \n"
      "vpsrld $0x8,%%ymm0,%%ymm0 \n"
      "vpand %%ymm4,%%ymm2,%%ymm2 \n"
      "vpand %%ymm3,%%ymm1,%%ymm1 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpor %%ymm2,%%ymm1,%%ymm1 \n"
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"
      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "lea 0x20(%0),%0 \n"
      "vmovdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1b,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "pslld $0x5,%%xmm5 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "pslld $0xa,%%xmm6 \n"
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "pslld $0xf,%%xmm7 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm0,%%xmm3 \n"
      "psrad $0x10,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x6,%%xmm2 \n"
      "psrld $0x9,%%xmm3 \n"
      "pand %%xmm7,%%xmm0 \n"
      "pand %%xmm4,%%xmm1 \n"
      "pand %%xmm5,%%xmm2 \n"
      "pand %%xmm6,%%xmm3 \n"
      "por %%xmm1,%%xmm0 \n"
      "por %%xmm3,%%xmm2 \n"
      "por %%xmm2,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}

void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0xc,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm3 \n"
      "psrlw $0x8,%%xmm3 \n"
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm3,%%xmm0 \n"
      "pand %%xmm4,%%xmm1 \n"
      "psrlq $0x4,%%xmm0 \n"
      "psrlq $0x8,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(width)  // %2
      ::"memory",
        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_RGB24TOARGBROW_SSSE3

/*
ARGBToAR30Row:

Red Blue
With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's
wanted for the blue channel. The red needs to be shifted 4 left, so multiply
by (1024+4)*16 for red.

Alpha Green
Alpha and Green are already in the high bits, so vpand can zero out the other
bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier
could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha
would be a simple multiplier to shift it into position. It wants a gap of 10
above the green. Green is 10 bits, so there are 6 bits in the low short. 4
more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits,
and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the
result left 10 to position the A and G channels.
*/
// Shuffle tables that move the Blue and Red bytes of each pixel into the high
// byte of a 16 bit lane, as required by the pmulhuw trick described above.
  833. static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
  834. 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
  835. static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
  836. 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};
  837. static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
  838. static const uint32_t kMaskRB10 = 0x3ff003ff;
  839. static const uint32_t kMaskAG10 = 0xc000ff00;
  840. static const uint32_t kMulAG10 = 64 * 65536 + 1028;
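// Scalar sketch (illustrative only) of the packing that kMulRB10 / kMulAG10
// implement. (v * 1028) >> 8 == (v << 2) | (v >> 6), i.e. 8 bit to 10 bit
// replication; the SIMD code sees each 8 bit value in the top of a 16 bit
// lane, so pmulhuw computes the same product. The red placement is folded into
// the 1028 * 16 half of kMulRB10, and the alpha handling into the 64 half of
// kMulAG10 plus the later shift left by 10.
static inline uint32_t PackAR30_Sketch(uint8_t b, uint8_t g, uint8_t r,
                                       uint8_t a) {
  uint32_t b10 = (b * 1028) >> 8;
  uint32_t g10 = (g * 1028) >> 8;
  uint32_t r10 = (r * 1028) >> 8;
  uint32_t a2 = a >> 6;  // keep the top 2 alpha bits
  return (a2 << 30) | (r10 << 20) | (g10 << 10) | b10;
}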
  841. void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  842. asm volatile(
  843. "movdqa %3,%%xmm2 \n" // shuffler for RB
  844. "movd %4,%%xmm3 \n" // multipler for RB
  845. "movd %5,%%xmm4 \n" // mask for R10 B10
  846. "movd %6,%%xmm5 \n" // mask for AG
  847. "movd %7,%%xmm6 \n" // multipler for AG
  848. "pshufd $0x0,%%xmm3,%%xmm3 \n"
  849. "pshufd $0x0,%%xmm4,%%xmm4 \n"
  850. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  851. "pshufd $0x0,%%xmm6,%%xmm6 \n"
  852. "sub %0,%1 \n"
  853. "1: \n"
  854. "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
  855. "movdqa %%xmm0,%%xmm1 \n"
  856. "pshufb %%xmm2,%%xmm1 \n" // R0B0
  857. "pand %%xmm5,%%xmm0 \n" // A0G0
  858. "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
  859. "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
  860. "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
  861. "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
  862. "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
  863. "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
  864. "add $0x10,%0 \n"
  865. "sub $0x4,%2 \n"
  866. "jg 1b \n"
  867. : "+r"(src), // %0
  868. "+r"(dst), // %1
  869. "+r"(width) // %2
  870. : "m"(kShuffleRB30), // %3
  871. "m"(kMulRB10), // %4
  872. "m"(kMaskRB10), // %5
  873. "m"(kMaskAG10), // %6
  874. "m"(kMulAG10) // %7
  875. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  876. }
  877. void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  878. asm volatile(
  879. "movdqa %3,%%xmm2 \n" // shuffler for RB
  880. "movd %4,%%xmm3 \n" // multipler for RB
  881. "movd %5,%%xmm4 \n" // mask for R10 B10
  882. "movd %6,%%xmm5 \n" // mask for AG
  883. "movd %7,%%xmm6 \n" // multipler for AG
  884. "pshufd $0x0,%%xmm3,%%xmm3 \n"
  885. "pshufd $0x0,%%xmm4,%%xmm4 \n"
  886. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  887. "pshufd $0x0,%%xmm6,%%xmm6 \n"
  888. "sub %0,%1 \n"
  889. "1: \n"
  890. "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
  891. "movdqa %%xmm0,%%xmm1 \n"
  892. "pshufb %%xmm2,%%xmm1 \n" // R0B0
  893. "pand %%xmm5,%%xmm0 \n" // A0G0
  894. "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
  895. "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
  896. "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
  897. "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
  898. "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
  899. "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
  900. "add $0x10,%0 \n"
  901. "sub $0x4,%2 \n"
  902. "jg 1b \n"
  903. : "+r"(src), // %0
  904. "+r"(dst), // %1
  905. "+r"(width) // %2
  906. : "m"(kShuffleBR30), // %3 reversed shuffler
  907. "m"(kMulRB10), // %4
  908. "m"(kMaskRB10), // %5
  909. "m"(kMaskAG10), // %6
  910. "m"(kMulAG10) // %7
  911. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  912. }
  913. #ifdef HAS_ARGBTOAR30ROW_AVX2
  914. void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  915. asm volatile(
  916. "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
  917. "vbroadcastss %4,%%ymm3 \n" // multipler for RB
  918. "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
  919. "vbroadcastss %6,%%ymm5 \n" // mask for AG
  920. "vbroadcastss %7,%%ymm6 \n" // multipler for AG
  921. "sub %0,%1 \n"
  922. "1: \n"
  923. "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
  924. "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
  925. "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
  926. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
  927. "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
  928. "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
  929. "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
  930. "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
  931. "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
  932. "add $0x20,%0 \n"
  933. "sub $0x8,%2 \n"
  934. "jg 1b \n"
  935. "vzeroupper \n"
  936. : "+r"(src), // %0
  937. "+r"(dst), // %1
  938. "+r"(width) // %2
  939. : "m"(kShuffleRB30), // %3
  940. "m"(kMulRB10), // %4
  941. "m"(kMaskRB10), // %5
  942. "m"(kMaskAG10), // %6
  943. "m"(kMulAG10) // %7
  944. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  945. }
  946. #endif
  947. #ifdef HAS_ABGRTOAR30ROW_AVX2
  948. void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  949. asm volatile(
  950. "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
  951. "vbroadcastss %4,%%ymm3 \n" // multipler for RB
  952. "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
  953. "vbroadcastss %6,%%ymm5 \n" // mask for AG
  954. "vbroadcastss %7,%%ymm6 \n" // multipler for AG
  955. "sub %0,%1 \n"
  956. "1: \n"
  957. "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
  958. "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
  959. "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
  960. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
  961. "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
  962. "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
  963. "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
  964. "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
  965. "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
  966. "add $0x20,%0 \n"
  967. "sub $0x8,%2 \n"
  968. "jg 1b \n"
  969. "vzeroupper \n"
  970. : "+r"(src), // %0
  971. "+r"(dst), // %1
  972. "+r"(width) // %2
  973. : "m"(kShuffleBR30), // %3 reversed shuffler
  974. "m"(kMulRB10), // %4
  975. "m"(kMaskRB10), // %5
  976. "m"(kMaskAG10), // %6
  977. "m"(kMulAG10) // %7
  978. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  979. }
  980. #endif
  981. // clang-format off
// TODO(mraptis): Consider passing R, G, B multipliers as parameters.
// The round parameter names the register holding the value that is added
// before the final shift; a scalar sketch of the per-pixel math follows the
// two macros below.
  984. #define RGBTOY(round) \
  985. "1: \n" \
  986. "movdqu (%0),%%xmm0 \n" \
  987. "movdqu 0x10(%0),%%xmm1 \n" \
  988. "movdqu 0x20(%0),%%xmm2 \n" \
  989. "movdqu 0x30(%0),%%xmm3 \n" \
  990. "psubb %%xmm5,%%xmm0 \n" \
  991. "psubb %%xmm5,%%xmm1 \n" \
  992. "psubb %%xmm5,%%xmm2 \n" \
  993. "psubb %%xmm5,%%xmm3 \n" \
  994. "movdqu %%xmm4,%%xmm6 \n" \
  995. "pmaddubsw %%xmm0,%%xmm6 \n" \
  996. "movdqu %%xmm4,%%xmm0 \n" \
  997. "pmaddubsw %%xmm1,%%xmm0 \n" \
  998. "movdqu %%xmm4,%%xmm1 \n" \
  999. "pmaddubsw %%xmm2,%%xmm1 \n" \
  1000. "movdqu %%xmm4,%%xmm2 \n" \
  1001. "pmaddubsw %%xmm3,%%xmm2 \n" \
  1002. "lea 0x40(%0),%0 \n" \
  1003. "phaddw %%xmm0,%%xmm6 \n" \
  1004. "phaddw %%xmm2,%%xmm1 \n" \
  1005. "prefetcht0 1280(%0) \n" \
  1006. "paddw %%" #round ",%%xmm6 \n" \
  1007. "paddw %%" #round ",%%xmm1 \n" \
  1008. "psrlw $0x8,%%xmm6 \n" \
  1009. "psrlw $0x8,%%xmm1 \n" \
  1010. "packuswb %%xmm1,%%xmm6 \n" \
  1011. "movdqu %%xmm6,(%1) \n" \
  1012. "lea 0x10(%1),%1 \n" \
  1013. "sub $0x10,%2 \n" \
  1014. "jg 1b \n"
  1015. #define RGBTOY_AVX2(round) \
  1016. "1: \n" \
  1017. "vmovdqu (%0),%%ymm0 \n" \
  1018. "vmovdqu 0x20(%0),%%ymm1 \n" \
  1019. "vmovdqu 0x40(%0),%%ymm2 \n" \
  1020. "vmovdqu 0x60(%0),%%ymm3 \n" \
  1021. "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
  1022. "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
  1023. "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
  1024. "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
  1025. "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
  1026. "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
  1027. "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
  1028. "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
  1029. "lea 0x80(%0),%0 \n" \
  1030. "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
  1031. "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
  1032. "prefetcht0 1280(%0) \n" \
  1033. "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
  1034. "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
  1035. "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
  1036. "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
  1037. "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
  1038. "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
  1039. "vmovdqu %%ymm0,(%1) \n" \
  1040. "lea 0x20(%1),%1 \n" \
  1041. "sub $0x20,%2 \n" \
  1042. "jg 1b \n" \
  1043. "vzeroupper \n"
  1044. // clang-format on
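// For orientation, a scalar sketch of what RGBTOY computes per pixel. The
// coefficients below are the usual BT.601 integer approximations; the SIMD
// macros read their exact coefficient, bias and rounding values from the
// kARGBToY / kSub128 / kAddY16 style constants defined elsewhere in this file.
static inline uint8_t RGBToY_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}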
  1045. #ifdef HAS_ARGBTOYROW_SSSE3
  1046. // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
  1047. void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  1048. asm volatile(
  1049. "movdqa %3,%%xmm4 \n"
  1050. "movdqa %4,%%xmm5 \n"
  1051. "movdqa %5,%%xmm7 \n"
  1052. LABELALIGN RGBTOY(xmm7)
  1053. : "+r"(src_argb), // %0
  1054. "+r"(dst_y), // %1
  1055. "+r"(width) // %2
  1056. : "m"(kARGBToY), // %3
  1057. "m"(kSub128), // %4
  1058. "m"(kAddY16) // %5
  1059. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1060. "xmm7");
  1061. }
  1062. #endif // HAS_ARGBTOYROW_SSSE3
  1063. #ifdef HAS_ARGBTOYJROW_SSSE3
  1064. // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
  1065. // Same as ARGBToYRow but different coefficients, no add 16.
  1066. void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  1067. asm volatile(
  1068. "movdqa %3,%%xmm4 \n"
  1069. "movdqa %4,%%xmm5 \n"
  1070. LABELALIGN RGBTOY(xmm5)
  1071. : "+r"(src_argb), // %0
  1072. "+r"(dst_y), // %1
  1073. "+r"(width) // %2
  1074. : "m"(kARGBToYJ), // %3
  1075. "m"(kSub128) // %4
  1076. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  1077. }
  1078. #endif // HAS_ARGBTOYJROW_SSSE3
  1079. #ifdef HAS_RGBATOYJROW_SSSE3
// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYJRow but with RGBA byte order coefficients; no add 16.
  1082. void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  1083. asm volatile(
  1084. "movdqa %3,%%xmm4 \n"
  1085. "movdqa %4,%%xmm5 \n"
  1086. LABELALIGN RGBTOY(xmm5)
  1087. : "+r"(src_rgba), // %0
  1088. "+r"(dst_y), // %1
  1089. "+r"(width) // %2
  1090. : "m"(kRGBAToYJ), // %3
  1091. "m"(kSub128) // %4
  1092. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  1093. }
  1094. #endif // HAS_RGBATOYJROW_SSSE3
  1095. #ifdef HAS_ARGBTOYROW_AVX2
// vpermd index table that restores the lane order mutated by vphaddw +
// vpackuswb.
  1097. static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
  1098. // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
  1099. void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  1100. asm volatile(
  1101. "vbroadcastf128 %3,%%ymm4 \n"
  1102. "vbroadcastf128 %4,%%ymm5 \n"
  1103. "vbroadcastf128 %5,%%ymm7 \n"
  1104. "vmovdqu %6,%%ymm6 \n"
  1105. LABELALIGN RGBTOY_AVX2(ymm7)
  1106. : "+r"(src_argb), // %0
  1107. "+r"(dst_y), // %1
  1108. "+r"(width) // %2
  1109. : "m"(kARGBToY), // %3
  1110. "m"(kSub128), // %4
  1111. "m"(kAddY16), // %5
  1112. "m"(kPermdARGBToY_AVX) // %6
  1113. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1114. "xmm7");
  1115. }
  1116. #endif // HAS_ARGBTOYROW_AVX2
  1117. #ifdef HAS_ABGRTOYROW_AVX2
  1118. // Convert 32 ABGR pixels (128 bytes) to 32 Y values.
  1119. void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  1120. asm volatile(
  1121. "vbroadcastf128 %3,%%ymm4 \n"
  1122. "vbroadcastf128 %4,%%ymm5 \n"
  1123. "vbroadcastf128 %5,%%ymm7 \n"
  1124. "vmovdqu %6,%%ymm6 \n"
  1125. LABELALIGN RGBTOY_AVX2(ymm7)
  1126. : "+r"(src_abgr), // %0
  1127. "+r"(dst_y), // %1
  1128. "+r"(width) // %2
  1129. : "m"(kABGRToY), // %3
  1130. "m"(kSub128), // %4
  1131. "m"(kAddY16), // %5
  1132. "m"(kPermdARGBToY_AVX) // %6
  1133. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1134. "xmm7");
  1135. }
  1136. #endif // HAS_ABGRTOYROW_AVX2
  1137. #ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
  1139. void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  1140. asm volatile(
  1141. "vbroadcastf128 %3,%%ymm4 \n"
  1142. "vbroadcastf128 %4,%%ymm5 \n"
  1143. "vmovdqu %5,%%ymm6 \n"
  1144. LABELALIGN RGBTOY_AVX2(ymm5)
  1145. : "+r"(src_argb), // %0
  1146. "+r"(dst_y), // %1
  1147. "+r"(width) // %2
  1148. : "m"(kARGBToYJ), // %3
  1149. "m"(kSub128), // %4
  1150. "m"(kPermdARGBToY_AVX) // %5
  1151. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1152. "xmm7");
  1153. }
  1154. #endif // HAS_ARGBTOYJROW_AVX2
  1155. #ifdef HAS_RGBATOYJROW_AVX2
// Convert 32 RGBA pixels (128 bytes) to 32 YJ values.
  1157. void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  1158. asm volatile(
  1159. "vbroadcastf128 %3,%%ymm4 \n"
  1160. "vbroadcastf128 %4,%%ymm5 \n"
  1161. "vmovdqu %5,%%ymm6 \n"
  1162. LABELALIGN RGBTOY_AVX2(
  1163. ymm5) "vzeroupper \n"
  1164. : "+r"(src_rgba), // %0
  1165. "+r"(dst_y), // %1
  1166. "+r"(width) // %2
  1167. : "m"(kRGBAToYJ), // %3
  1168. "m"(kSub128), // %4
  1169. "m"(kPermdARGBToY_AVX) // %5
  1170. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  1171. }
  1172. #endif // HAS_RGBATOYJROW_AVX2
  1173. #ifdef HAS_ARGBTOUVROW_SSSE3
  1174. void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
  1175. int src_stride_argb,
  1176. uint8_t* dst_u,
  1177. uint8_t* dst_v,
  1178. int width) {
  1179. asm volatile(
  1180. "movdqa %5,%%xmm3 \n"
  1181. "movdqa %6,%%xmm4 \n"
  1182. "movdqa %7,%%xmm5 \n"
  1183. "sub %1,%2 \n"
  1184. LABELALIGN
  1185. "1: \n"
  1186. "movdqu (%0),%%xmm0 \n"
  1187. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1188. "pavgb %%xmm7,%%xmm0 \n"
  1189. "movdqu 0x10(%0),%%xmm1 \n"
  1190. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1191. "pavgb %%xmm7,%%xmm1 \n"
  1192. "movdqu 0x20(%0),%%xmm2 \n"
  1193. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1194. "pavgb %%xmm7,%%xmm2 \n"
  1195. "movdqu 0x30(%0),%%xmm6 \n"
  1196. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1197. "pavgb %%xmm7,%%xmm6 \n"
  1198. "lea 0x40(%0),%0 \n"
  1199. "movdqa %%xmm0,%%xmm7 \n"
  1200. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1201. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1202. "pavgb %%xmm7,%%xmm0 \n"
  1203. "movdqa %%xmm2,%%xmm7 \n"
  1204. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1205. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1206. "pavgb %%xmm7,%%xmm2 \n"
  1207. "movdqa %%xmm0,%%xmm1 \n"
  1208. "movdqa %%xmm2,%%xmm6 \n"
  1209. "pmaddubsw %%xmm4,%%xmm0 \n"
  1210. "pmaddubsw %%xmm4,%%xmm2 \n"
  1211. "pmaddubsw %%xmm3,%%xmm1 \n"
  1212. "pmaddubsw %%xmm3,%%xmm6 \n"
  1213. "phaddw %%xmm2,%%xmm0 \n"
  1214. "phaddw %%xmm6,%%xmm1 \n"
  1215. "psraw $0x8,%%xmm0 \n"
  1216. "psraw $0x8,%%xmm1 \n"
  1217. "packsswb %%xmm1,%%xmm0 \n"
  1218. "paddb %%xmm5,%%xmm0 \n"
  1219. "movlps %%xmm0,(%1) \n"
  1220. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1221. "lea 0x8(%1),%1 \n"
  1222. "sub $0x10,%3 \n"
  1223. "jg 1b \n"
  1224. : "+r"(src_argb0), // %0
  1225. "+r"(dst_u), // %1
  1226. "+r"(dst_v), // %2
  1227. "+rm"(width) // %3
  1228. : "r"((intptr_t)(src_stride_argb)), // %4
  1229. "m"(kARGBToV), // %5
  1230. "m"(kARGBToU), // %6
  1231. "m"(kAddUV128) // %7
  1232. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1233. }
  1234. #endif // HAS_ARGBTOUVROW_SSSE3
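// Scalar sketch of the chroma math used by the UV row functions above and
// below: pixels are averaged 2x2 (the pavgb with the second row, then the
// shufps/pavgb pass), and the U and V coefficients are applied to the average.
// The values below are the common BT.601 integer approximations; the SIMD path
// takes its exact coefficients from kARGBToU / kARGBToV plus the kAddUV128
// bias, and the J (full range) variants use kARGBToUJ / kARGBToVJ instead.
static inline uint8_t RGBToU_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
}
static inline uint8_t RGBToV_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}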
  1235. #ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb table that restores the word order mutated by vphaddw + vpacksswb.
  1237. static const lvec8 kShufARGBToUV_AVX = {
  1238. 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  1239. 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
  1240. void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
  1241. int src_stride_argb,
  1242. uint8_t* dst_u,
  1243. uint8_t* dst_v,
  1244. int width) {
  1245. asm volatile(
  1246. "vbroadcastf128 %5,%%ymm5 \n"
  1247. "vbroadcastf128 %6,%%ymm6 \n"
  1248. "vbroadcastf128 %7,%%ymm7 \n"
  1249. "sub %1,%2 \n"
  1250. LABELALIGN
  1251. "1: \n"
  1252. "vmovdqu (%0),%%ymm0 \n"
  1253. "vmovdqu 0x20(%0),%%ymm1 \n"
  1254. "vmovdqu 0x40(%0),%%ymm2 \n"
  1255. "vmovdqu 0x60(%0),%%ymm3 \n"
  1256. "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
  1257. "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
  1258. "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
  1259. "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
  1260. "lea 0x80(%0),%0 \n"
  1261. "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
  1262. "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
  1263. "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
  1264. "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
  1265. "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
  1266. "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
  1267. "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
  1268. "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
  1269. "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
  1270. "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
  1271. "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
  1272. "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
  1273. "vpsraw $0x8,%%ymm1,%%ymm1 \n"
  1274. "vpsraw $0x8,%%ymm0,%%ymm0 \n"
  1275. "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
  1276. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  1277. "vpshufb %8,%%ymm0,%%ymm0 \n"
  1278. "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
  1279. "vextractf128 $0x0,%%ymm0,(%1) \n"
  1280. "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
  1281. "lea 0x10(%1),%1 \n"
  1282. "sub $0x20,%3 \n"
  1283. "jg 1b \n"
  1284. "vzeroupper \n"
  1285. : "+r"(src_argb0), // %0
  1286. "+r"(dst_u), // %1
  1287. "+r"(dst_v), // %2
  1288. "+rm"(width) // %3
  1289. : "r"((intptr_t)(src_stride_argb)), // %4
  1290. "m"(kAddUV128), // %5
  1291. "m"(kARGBToV), // %6
  1292. "m"(kARGBToU), // %7
  1293. "m"(kShufARGBToUV_AVX) // %8
  1294. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1295. "xmm7");
  1296. }
  1297. #endif // HAS_ARGBTOUVROW_AVX2
  1298. #ifdef HAS_ABGRTOUVROW_AVX2
  1299. void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
  1300. int src_stride_abgr,
  1301. uint8_t* dst_u,
  1302. uint8_t* dst_v,
  1303. int width) {
  1304. asm volatile(
  1305. "vbroadcastf128 %5,%%ymm5 \n"
  1306. "vbroadcastf128 %6,%%ymm6 \n"
  1307. "vbroadcastf128 %7,%%ymm7 \n"
  1308. "sub %1,%2 \n"
  1309. LABELALIGN
  1310. "1: \n"
  1311. "vmovdqu (%0),%%ymm0 \n"
  1312. "vmovdqu 0x20(%0),%%ymm1 \n"
  1313. "vmovdqu 0x40(%0),%%ymm2 \n"
  1314. "vmovdqu 0x60(%0),%%ymm3 \n"
  1315. "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
  1316. "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
  1317. "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
  1318. "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
  1319. "lea 0x80(%0),%0 \n"
  1320. "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
  1321. "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
  1322. "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
  1323. "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
  1324. "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
  1325. "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
  1326. "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
  1327. "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
  1328. "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
  1329. "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
  1330. "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
  1331. "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
  1332. "vpsraw $0x8,%%ymm1,%%ymm1 \n"
  1333. "vpsraw $0x8,%%ymm0,%%ymm0 \n"
  1334. "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
  1335. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  1336. "vpshufb %8,%%ymm0,%%ymm0 \n"
  1337. "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
  1338. "vextractf128 $0x0,%%ymm0,(%1) \n"
  1339. "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
  1340. "lea 0x10(%1),%1 \n"
  1341. "sub $0x20,%3 \n"
  1342. "jg 1b \n"
  1343. "vzeroupper \n"
  1344. : "+r"(src_abgr0), // %0
  1345. "+r"(dst_u), // %1
  1346. "+r"(dst_v), // %2
  1347. "+rm"(width) // %3
  1348. : "r"((intptr_t)(src_stride_abgr)), // %4
  1349. "m"(kAddUV128), // %5
  1350. "m"(kABGRToV), // %6
  1351. "m"(kABGRToU), // %7
  1352. "m"(kShufARGBToUV_AVX) // %8
  1353. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1354. "xmm7");
  1355. }
  1356. #endif // HAS_ABGRTOUVROW_AVX2
  1357. #ifdef HAS_ARGBTOUVJROW_AVX2
  1358. void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
  1359. int src_stride_argb,
  1360. uint8_t* dst_u,
  1361. uint8_t* dst_v,
  1362. int width) {
  1363. asm volatile(
  1364. "vbroadcastf128 %5,%%ymm5 \n"
  1365. "vbroadcastf128 %6,%%ymm6 \n"
  1366. "vbroadcastf128 %7,%%ymm7 \n"
  1367. "sub %1,%2 \n"
  1368. LABELALIGN
  1369. "1: \n"
  1370. "vmovdqu (%0),%%ymm0 \n"
  1371. "vmovdqu 0x20(%0),%%ymm1 \n"
  1372. "vmovdqu 0x40(%0),%%ymm2 \n"
  1373. "vmovdqu 0x60(%0),%%ymm3 \n"
  1374. "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
  1375. "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
  1376. "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
  1377. "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
  1378. "lea 0x80(%0),%0 \n"
  1379. "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
  1380. "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
  1381. "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
  1382. "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
  1383. "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
  1384. "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
  1385. "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
  1386. "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
  1387. "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
  1388. "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
  1389. "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
  1390. "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
  1391. "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
  1392. "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
  1393. "vpsraw $0x8,%%ymm1,%%ymm1 \n"
  1394. "vpsraw $0x8,%%ymm0,%%ymm0 \n"
  1395. "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
  1396. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  1397. "vpshufb %8,%%ymm0,%%ymm0 \n"
  1398. "vextractf128 $0x0,%%ymm0,(%1) \n"
  1399. "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
  1400. "lea 0x10(%1),%1 \n"
  1401. "sub $0x20,%3 \n"
  1402. "jg 1b \n"
  1403. "vzeroupper \n"
  1404. : "+r"(src_argb0), // %0
  1405. "+r"(dst_u), // %1
  1406. "+r"(dst_v), // %2
  1407. "+rm"(width) // %3
  1408. : "r"((intptr_t)(src_stride_argb)), // %4
  1409. "m"(kSub128), // %5
  1410. "m"(kARGBToVJ), // %6
  1411. "m"(kARGBToUJ), // %7
  1412. "m"(kShufARGBToUV_AVX) // %8
  1413. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1414. "xmm7");
  1415. }
  1416. #endif // HAS_ARGBTOUVJROW_AVX2
  1417. #ifdef HAS_ARGBTOUVJROW_SSSE3
  1418. void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
  1419. int src_stride_argb,
  1420. uint8_t* dst_u,
  1421. uint8_t* dst_v,
  1422. int width) {
  1423. asm volatile(
  1424. "movdqa %5,%%xmm3 \n"
  1425. "movdqa %6,%%xmm4 \n"
  1426. "movdqa %7,%%xmm5 \n"
  1427. "sub %1,%2 \n"
  1428. LABELALIGN
  1429. "1: \n"
  1430. "movdqu (%0),%%xmm0 \n"
  1431. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1432. "pavgb %%xmm7,%%xmm0 \n"
  1433. "movdqu 0x10(%0),%%xmm1 \n"
  1434. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1435. "pavgb %%xmm7,%%xmm1 \n"
  1436. "movdqu 0x20(%0),%%xmm2 \n"
  1437. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1438. "pavgb %%xmm7,%%xmm2 \n"
  1439. "movdqu 0x30(%0),%%xmm6 \n"
  1440. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1441. "pavgb %%xmm7,%%xmm6 \n"
  1442. "lea 0x40(%0),%0 \n"
  1443. "movdqa %%xmm0,%%xmm7 \n"
  1444. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1445. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1446. "pavgb %%xmm7,%%xmm0 \n"
  1447. "movdqa %%xmm2,%%xmm7 \n"
  1448. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1449. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1450. "pavgb %%xmm7,%%xmm2 \n"
  1451. "movdqa %%xmm0,%%xmm1 \n"
  1452. "movdqa %%xmm2,%%xmm6 \n"
  1453. "pmaddubsw %%xmm4,%%xmm0 \n"
  1454. "pmaddubsw %%xmm4,%%xmm2 \n"
  1455. "pmaddubsw %%xmm3,%%xmm1 \n"
  1456. "pmaddubsw %%xmm3,%%xmm6 \n"
  1457. "phaddw %%xmm2,%%xmm0 \n"
  1458. "phaddw %%xmm6,%%xmm1 \n"
  1459. "paddw %%xmm5,%%xmm0 \n"
  1460. "paddw %%xmm5,%%xmm1 \n"
  1461. "psraw $0x8,%%xmm0 \n"
  1462. "psraw $0x8,%%xmm1 \n"
  1463. "packsswb %%xmm1,%%xmm0 \n"
  1464. "movlps %%xmm0,(%1) \n"
  1465. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1466. "lea 0x8(%1),%1 \n"
  1467. "sub $0x10,%3 \n"
  1468. "jg 1b \n"
  1469. : "+r"(src_argb0), // %0
  1470. "+r"(dst_u), // %1
  1471. "+r"(dst_v), // %2
  1472. "+rm"(width) // %3
  1473. : "r"((intptr_t)(src_stride_argb)), // %4
  1474. "m"(kARGBToVJ), // %5
  1475. "m"(kARGBToUJ), // %6
  1476. "m"(kSub128) // %7
  1477. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1478. }
  1479. #endif // HAS_ARGBTOUVJROW_SSSE3
  1480. #ifdef HAS_ARGBTOUV444ROW_SSSE3
  1481. void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
  1482. uint8_t* dst_u,
  1483. uint8_t* dst_v,
  1484. int width) {
  1485. asm volatile(
  1486. "movdqa %4,%%xmm3 \n"
  1487. "movdqa %5,%%xmm4 \n"
  1488. "movdqa %6,%%xmm5 \n"
  1489. "sub %1,%2 \n"
  1490. LABELALIGN
  1491. "1: \n"
  1492. "movdqu (%0),%%xmm0 \n"
  1493. "movdqu 0x10(%0),%%xmm1 \n"
  1494. "movdqu 0x20(%0),%%xmm2 \n"
  1495. "movdqu 0x30(%0),%%xmm6 \n"
  1496. "pmaddubsw %%xmm4,%%xmm0 \n"
  1497. "pmaddubsw %%xmm4,%%xmm1 \n"
  1498. "pmaddubsw %%xmm4,%%xmm2 \n"
  1499. "pmaddubsw %%xmm4,%%xmm6 \n"
  1500. "phaddw %%xmm1,%%xmm0 \n"
  1501. "phaddw %%xmm6,%%xmm2 \n"
  1502. "psraw $0x8,%%xmm0 \n"
  1503. "psraw $0x8,%%xmm2 \n"
  1504. "packsswb %%xmm2,%%xmm0 \n"
  1505. "paddb %%xmm5,%%xmm0 \n"
  1506. "movdqu %%xmm0,(%1) \n"
  1507. "movdqu (%0),%%xmm0 \n"
  1508. "movdqu 0x10(%0),%%xmm1 \n"
  1509. "movdqu 0x20(%0),%%xmm2 \n"
  1510. "movdqu 0x30(%0),%%xmm6 \n"
  1511. "pmaddubsw %%xmm3,%%xmm0 \n"
  1512. "pmaddubsw %%xmm3,%%xmm1 \n"
  1513. "pmaddubsw %%xmm3,%%xmm2 \n"
  1514. "pmaddubsw %%xmm3,%%xmm6 \n"
  1515. "phaddw %%xmm1,%%xmm0 \n"
  1516. "phaddw %%xmm6,%%xmm2 \n"
  1517. "psraw $0x8,%%xmm0 \n"
  1518. "psraw $0x8,%%xmm2 \n"
  1519. "packsswb %%xmm2,%%xmm0 \n"
  1520. "paddb %%xmm5,%%xmm0 \n"
  1521. "lea 0x40(%0),%0 \n"
  1522. "movdqu %%xmm0,0x00(%1,%2,1) \n"
  1523. "lea 0x10(%1),%1 \n"
  1524. "sub $0x10,%3 \n"
  1525. "jg 1b \n"
  1526. : "+r"(src_argb), // %0
  1527. "+r"(dst_u), // %1
  1528. "+r"(dst_v), // %2
  1529. "+rm"(width) // %3
  1530. : "m"(kARGBToV), // %4
  1531. "m"(kARGBToU), // %5
  1532. "m"(kAddUV128) // %6
  1533. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
  1534. }
  1535. #endif // HAS_ARGBTOUV444ROW_SSSE3
  1536. void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  1537. asm volatile(
  1538. "movdqa %3,%%xmm4 \n"
  1539. "movdqa %4,%%xmm5 \n"
  1540. "movdqa %5,%%xmm7 \n"
  1541. LABELALIGN RGBTOY(xmm7)
  1542. : "+r"(src_bgra), // %0
  1543. "+r"(dst_y), // %1
  1544. "+r"(width) // %2
  1545. : "m"(kBGRAToY), // %3
  1546. "m"(kSub128), // %4
  1547. "m"(kAddY16) // %5
  1548. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1549. "xmm7");
  1550. }
  1551. void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
  1552. int src_stride_bgra,
  1553. uint8_t* dst_u,
  1554. uint8_t* dst_v,
  1555. int width) {
  1556. asm volatile(
  1557. "movdqa %5,%%xmm3 \n"
  1558. "movdqa %6,%%xmm4 \n"
  1559. "movdqa %7,%%xmm5 \n"
  1560. "sub %1,%2 \n"
  1561. LABELALIGN
  1562. "1: \n"
  1563. "movdqu (%0),%%xmm0 \n"
  1564. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1565. "pavgb %%xmm7,%%xmm0 \n"
  1566. "movdqu 0x10(%0),%%xmm1 \n"
  1567. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1568. "pavgb %%xmm7,%%xmm1 \n"
  1569. "movdqu 0x20(%0),%%xmm2 \n"
  1570. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1571. "pavgb %%xmm7,%%xmm2 \n"
  1572. "movdqu 0x30(%0),%%xmm6 \n"
  1573. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1574. "pavgb %%xmm7,%%xmm6 \n"
  1575. "lea 0x40(%0),%0 \n"
  1576. "movdqa %%xmm0,%%xmm7 \n"
  1577. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1578. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1579. "pavgb %%xmm7,%%xmm0 \n"
  1580. "movdqa %%xmm2,%%xmm7 \n"
  1581. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1582. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1583. "pavgb %%xmm7,%%xmm2 \n"
  1584. "movdqa %%xmm0,%%xmm1 \n"
  1585. "movdqa %%xmm2,%%xmm6 \n"
  1586. "pmaddubsw %%xmm4,%%xmm0 \n"
  1587. "pmaddubsw %%xmm4,%%xmm2 \n"
  1588. "pmaddubsw %%xmm3,%%xmm1 \n"
  1589. "pmaddubsw %%xmm3,%%xmm6 \n"
  1590. "phaddw %%xmm2,%%xmm0 \n"
  1591. "phaddw %%xmm6,%%xmm1 \n"
  1592. "psraw $0x8,%%xmm0 \n"
  1593. "psraw $0x8,%%xmm1 \n"
  1594. "packsswb %%xmm1,%%xmm0 \n"
  1595. "paddb %%xmm5,%%xmm0 \n"
  1596. "movlps %%xmm0,(%1) \n"
  1597. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1598. "lea 0x8(%1),%1 \n"
  1599. "sub $0x10,%3 \n"
  1600. "jg 1b \n"
  1601. : "+r"(src_bgra0), // %0
  1602. "+r"(dst_u), // %1
  1603. "+r"(dst_v), // %2
  1604. "+rm"(width) // %3
  1605. : "r"((intptr_t)(src_stride_bgra)), // %4
  1606. "m"(kBGRAToV), // %5
  1607. "m"(kBGRAToU), // %6
  1608. "m"(kAddUV128) // %7
  1609. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1610. }
  1611. void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  1612. asm volatile(
  1613. "movdqa %3,%%xmm4 \n"
  1614. "movdqa %4,%%xmm5 \n"
  1615. "movdqa %5,%%xmm7 \n"
  1616. LABELALIGN RGBTOY(xmm7)
  1617. : "+r"(src_abgr), // %0
  1618. "+r"(dst_y), // %1
  1619. "+r"(width) // %2
  1620. : "m"(kABGRToY), // %3
  1621. "m"(kSub128), // %4
  1622. "m"(kAddY16) // %5
  1623. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1624. "xmm7");
  1625. }
  1626. void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  1627. asm volatile(
  1628. "movdqa %3,%%xmm4 \n"
  1629. "movdqa %4,%%xmm5 \n"
  1630. "movdqa %5,%%xmm7 \n"
  1631. LABELALIGN RGBTOY(xmm7)
  1632. : "+r"(src_rgba), // %0
  1633. "+r"(dst_y), // %1
  1634. "+r"(width) // %2
  1635. : "m"(kRGBAToY), // %3
  1636. "m"(kSub128), // %4
  1637. "m"(kAddY16) // %5
  1638. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  1639. "xmm7");
  1640. }
  1641. void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
  1642. int src_stride_abgr,
  1643. uint8_t* dst_u,
  1644. uint8_t* dst_v,
  1645. int width) {
  1646. asm volatile(
  1647. "movdqa %5,%%xmm3 \n"
  1648. "movdqa %6,%%xmm4 \n"
  1649. "movdqa %7,%%xmm5 \n"
  1650. "sub %1,%2 \n"
  1651. LABELALIGN
  1652. "1: \n"
  1653. "movdqu (%0),%%xmm0 \n"
  1654. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1655. "pavgb %%xmm7,%%xmm0 \n"
  1656. "movdqu 0x10(%0),%%xmm1 \n"
  1657. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1658. "pavgb %%xmm7,%%xmm1 \n"
  1659. "movdqu 0x20(%0),%%xmm2 \n"
  1660. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1661. "pavgb %%xmm7,%%xmm2 \n"
  1662. "movdqu 0x30(%0),%%xmm6 \n"
  1663. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1664. "pavgb %%xmm7,%%xmm6 \n"
  1665. "lea 0x40(%0),%0 \n"
  1666. "movdqa %%xmm0,%%xmm7 \n"
  1667. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1668. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1669. "pavgb %%xmm7,%%xmm0 \n"
  1670. "movdqa %%xmm2,%%xmm7 \n"
  1671. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1672. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1673. "pavgb %%xmm7,%%xmm2 \n"
  1674. "movdqa %%xmm0,%%xmm1 \n"
  1675. "movdqa %%xmm2,%%xmm6 \n"
  1676. "pmaddubsw %%xmm4,%%xmm0 \n"
  1677. "pmaddubsw %%xmm4,%%xmm2 \n"
  1678. "pmaddubsw %%xmm3,%%xmm1 \n"
  1679. "pmaddubsw %%xmm3,%%xmm6 \n"
  1680. "phaddw %%xmm2,%%xmm0 \n"
  1681. "phaddw %%xmm6,%%xmm1 \n"
  1682. "psraw $0x8,%%xmm0 \n"
  1683. "psraw $0x8,%%xmm1 \n"
  1684. "packsswb %%xmm1,%%xmm0 \n"
  1685. "paddb %%xmm5,%%xmm0 \n"
  1686. "movlps %%xmm0,(%1) \n"
  1687. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1688. "lea 0x8(%1),%1 \n"
  1689. "sub $0x10,%3 \n"
  1690. "jg 1b \n"
  1691. : "+r"(src_abgr0), // %0
  1692. "+r"(dst_u), // %1
  1693. "+r"(dst_v), // %2
  1694. "+rm"(width) // %3
  1695. : "r"((intptr_t)(src_stride_abgr)), // %4
  1696. "m"(kABGRToV), // %5
  1697. "m"(kABGRToU), // %6
  1698. "m"(kAddUV128) // %7
  1699. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1700. }
  1701. void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
  1702. int src_stride_rgba,
  1703. uint8_t* dst_u,
  1704. uint8_t* dst_v,
  1705. int width) {
  1706. asm volatile(
  1707. "movdqa %5,%%xmm3 \n"
  1708. "movdqa %6,%%xmm4 \n"
  1709. "movdqa %7,%%xmm5 \n"
  1710. "sub %1,%2 \n"
  1711. LABELALIGN
  1712. "1: \n"
  1713. "movdqu (%0),%%xmm0 \n"
  1714. "movdqu 0x00(%0,%4,1),%%xmm7 \n"
  1715. "pavgb %%xmm7,%%xmm0 \n"
  1716. "movdqu 0x10(%0),%%xmm1 \n"
  1717. "movdqu 0x10(%0,%4,1),%%xmm7 \n"
  1718. "pavgb %%xmm7,%%xmm1 \n"
  1719. "movdqu 0x20(%0),%%xmm2 \n"
  1720. "movdqu 0x20(%0,%4,1),%%xmm7 \n"
  1721. "pavgb %%xmm7,%%xmm2 \n"
  1722. "movdqu 0x30(%0),%%xmm6 \n"
  1723. "movdqu 0x30(%0,%4,1),%%xmm7 \n"
  1724. "pavgb %%xmm7,%%xmm6 \n"
  1725. "lea 0x40(%0),%0 \n"
  1726. "movdqa %%xmm0,%%xmm7 \n"
  1727. "shufps $0x88,%%xmm1,%%xmm0 \n"
  1728. "shufps $0xdd,%%xmm1,%%xmm7 \n"
  1729. "pavgb %%xmm7,%%xmm0 \n"
  1730. "movdqa %%xmm2,%%xmm7 \n"
  1731. "shufps $0x88,%%xmm6,%%xmm2 \n"
  1732. "shufps $0xdd,%%xmm6,%%xmm7 \n"
  1733. "pavgb %%xmm7,%%xmm2 \n"
  1734. "movdqa %%xmm0,%%xmm1 \n"
  1735. "movdqa %%xmm2,%%xmm6 \n"
  1736. "pmaddubsw %%xmm4,%%xmm0 \n"
  1737. "pmaddubsw %%xmm4,%%xmm2 \n"
  1738. "pmaddubsw %%xmm3,%%xmm1 \n"
  1739. "pmaddubsw %%xmm3,%%xmm6 \n"
  1740. "phaddw %%xmm2,%%xmm0 \n"
  1741. "phaddw %%xmm6,%%xmm1 \n"
  1742. "psraw $0x8,%%xmm0 \n"
  1743. "psraw $0x8,%%xmm1 \n"
  1744. "packsswb %%xmm1,%%xmm0 \n"
  1745. "paddb %%xmm5,%%xmm0 \n"
  1746. "movlps %%xmm0,(%1) \n"
  1747. "movhps %%xmm0,0x00(%1,%2,1) \n"
  1748. "lea 0x8(%1),%1 \n"
  1749. "sub $0x10,%3 \n"
  1750. "jg 1b \n"
  1751. : "+r"(src_rgba0), // %0
  1752. "+r"(dst_u), // %1
  1753. "+r"(dst_v), // %2
  1754. "+rm"(width) // %3
  1755. : "r"((intptr_t)(src_stride_rgba)), // %4
  1756. "m"(kRGBAToV), // %5
  1757. "m"(kRGBAToU), // %6
  1758. "m"(kAddUV128) // %7
  1759. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
  1760. }
  1761. #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
  1762. // Read 8 UV from 444
  1763. #define READYUV444 \
  1764. "movq (%[u_buf]),%%xmm0 \n" \
  1765. "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  1766. "lea 0x8(%[u_buf]),%[u_buf] \n" \
  1767. "punpcklbw %%xmm1,%%xmm0 \n" \
  1768. "movq (%[y_buf]),%%xmm4 \n" \
  1769. "punpcklbw %%xmm4,%%xmm4 \n" \
  1770. "lea 0x8(%[y_buf]),%[y_buf] \n"
  1771. // Read 4 UV from 422, upsample to 8 UV
  1772. #define READYUV422 \
  1773. "movd (%[u_buf]),%%xmm0 \n" \
  1774. "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  1775. "lea 0x4(%[u_buf]),%[u_buf] \n" \
  1776. "punpcklbw %%xmm1,%%xmm0 \n" \
  1777. "punpcklwd %%xmm0,%%xmm0 \n" \
  1778. "movq (%[y_buf]),%%xmm4 \n" \
  1779. "punpcklbw %%xmm4,%%xmm4 \n" \
  1780. "lea 0x8(%[y_buf]),%[y_buf] \n"
  1781. // Read 4 UV from 422 10 bit, upsample to 8 UV
  1782. // TODO(fbarchard): Consider shufb to replace pack/unpack
  1783. // TODO(fbarchard): Consider pmulhuw to replace psraw
  1784. // TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
  1785. #define READYUV210 \
  1786. "movq (%[u_buf]),%%xmm0 \n" \
  1787. "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  1788. "lea 0x8(%[u_buf]),%[u_buf] \n" \
  1789. "punpcklwd %%xmm1,%%xmm0 \n" \
  1790. "psraw $0x2,%%xmm0 \n" \
  1791. "packuswb %%xmm0,%%xmm0 \n" \
  1792. "punpcklwd %%xmm0,%%xmm0 \n" \
  1793. "movdqu (%[y_buf]),%%xmm4 \n" \
  1794. "psllw $0x6,%%xmm4 \n" \
  1795. "lea 0x10(%[y_buf]),%[y_buf] \n"
  1796. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
  1797. #define READYUVA422 \
  1798. "movd (%[u_buf]),%%xmm0 \n" \
  1799. "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  1800. "lea 0x4(%[u_buf]),%[u_buf] \n" \
  1801. "punpcklbw %%xmm1,%%xmm0 \n" \
  1802. "punpcklwd %%xmm0,%%xmm0 \n" \
  1803. "movq (%[y_buf]),%%xmm4 \n" \
  1804. "punpcklbw %%xmm4,%%xmm4 \n" \
  1805. "lea 0x8(%[y_buf]),%[y_buf] \n" \
  1806. "movq (%[a_buf]),%%xmm5 \n" \
  1807. "lea 0x8(%[a_buf]),%[a_buf] \n"
  1808. // Read 4 UV from NV12, upsample to 8 UV
  1809. #define READNV12 \
  1810. "movq (%[uv_buf]),%%xmm0 \n" \
  1811. "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
  1812. "punpcklwd %%xmm0,%%xmm0 \n" \
  1813. "movq (%[y_buf]),%%xmm4 \n" \
  1814. "punpcklbw %%xmm4,%%xmm4 \n" \
  1815. "lea 0x8(%[y_buf]),%[y_buf] \n"
  1816. // Read 4 VU from NV21, upsample to 8 UV
  1817. #define READNV21 \
  1818. "movq (%[vu_buf]),%%xmm0 \n" \
  1819. "lea 0x8(%[vu_buf]),%[vu_buf] \n" \
  1820. "pshufb %[kShuffleNV21], %%xmm0 \n" \
  1821. "movq (%[y_buf]),%%xmm4 \n" \
  1822. "punpcklbw %%xmm4,%%xmm4 \n" \
  1823. "lea 0x8(%[y_buf]),%[y_buf] \n"
// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
  1825. #define READYUY2 \
  1826. "movdqu (%[yuy2_buf]),%%xmm4 \n" \
  1827. "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
  1828. "movdqu (%[yuy2_buf]),%%xmm0 \n" \
  1829. "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
  1830. "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
  1832. #define READUYVY \
  1833. "movdqu (%[uyvy_buf]),%%xmm4 \n" \
  1834. "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
  1835. "movdqu (%[uyvy_buf]),%%xmm0 \n" \
  1836. "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
  1837. "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
  1838. #if defined(__x86_64__)
  1839. #define YUVTORGB_SETUP(yuvconstants) \
  1840. "movdqa (%[yuvconstants]),%%xmm8 \n" \
  1841. "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
  1842. "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
  1843. "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
  1844. "movdqa 128(%[yuvconstants]),%%xmm12 \n" \
  1845. "movdqa 160(%[yuvconstants]),%%xmm13 \n" \
  1846. "movdqa 192(%[yuvconstants]),%%xmm14 \n"
  1847. // Convert 8 pixels: 8 UV and 8 Y
  1848. #define YUVTORGB16(yuvconstants) \
  1849. "movdqa %%xmm0,%%xmm1 \n" \
  1850. "movdqa %%xmm0,%%xmm2 \n" \
  1851. "movdqa %%xmm0,%%xmm3 \n" \
  1852. "movdqa %%xmm11,%%xmm0 \n" \
  1853. "pmaddubsw %%xmm8,%%xmm1 \n" \
  1854. "psubw %%xmm1,%%xmm0 \n" \
  1855. "movdqa %%xmm12,%%xmm1 \n" \
  1856. "pmaddubsw %%xmm9,%%xmm2 \n" \
  1857. "psubw %%xmm2,%%xmm1 \n" \
  1858. "movdqa %%xmm13,%%xmm2 \n" \
  1859. "pmaddubsw %%xmm10,%%xmm3 \n" \
  1860. "psubw %%xmm3,%%xmm2 \n" \
  1861. "pmulhuw %%xmm14,%%xmm4 \n" \
  1862. "paddsw %%xmm4,%%xmm0 \n" \
  1863. "paddsw %%xmm4,%%xmm1 \n" \
  1864. "paddsw %%xmm4,%%xmm2 \n"
  1865. #define YUVTORGB_REGS \
  1866. "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
  1867. #else
  1868. #define YUVTORGB_SETUP(yuvconstants)
  1869. // Convert 8 pixels: 8 UV and 8 Y
  1870. #define YUVTORGB16(yuvconstants) \
  1871. "movdqa %%xmm0,%%xmm1 \n" \
  1872. "movdqa %%xmm0,%%xmm2 \n" \
  1873. "movdqa %%xmm0,%%xmm3 \n" \
  1874. "movdqa 96(%[yuvconstants]),%%xmm0 \n" \
  1875. "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \
  1876. "psubw %%xmm1,%%xmm0 \n" \
  1877. "movdqa 128(%[yuvconstants]),%%xmm1 \n" \
  1878. "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \
  1879. "psubw %%xmm2,%%xmm1 \n" \
  1880. "movdqa 160(%[yuvconstants]),%%xmm2 \n" \
  1881. "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \
  1882. "psubw %%xmm3,%%xmm2 \n" \
  1883. "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
  1884. "paddsw %%xmm4,%%xmm0 \n" \
  1885. "paddsw %%xmm4,%%xmm1 \n" \
  1886. "paddsw %%xmm4,%%xmm2 \n"
  1887. #define YUVTORGB_REGS
  1888. #endif
  1889. #define YUVTORGB(yuvconstants) \
  1890. YUVTORGB16(yuvconstants) \
  1891. "psraw $0x6,%%xmm0 \n" \
  1892. "psraw $0x6,%%xmm1 \n" \
  1893. "psraw $0x6,%%xmm2 \n" \
  1894. "packuswb %%xmm0,%%xmm0 \n" \
  1895. "packuswb %%xmm1,%%xmm1 \n" \
  1896. "packuswb %%xmm2,%%xmm2 \n"
  1897. // Store 8 ARGB values.
  1898. #define STOREARGB \
  1899. "punpcklbw %%xmm1,%%xmm0 \n" \
  1900. "punpcklbw %%xmm5,%%xmm2 \n" \
  1901. "movdqa %%xmm0,%%xmm1 \n" \
  1902. "punpcklwd %%xmm2,%%xmm0 \n" \
  1903. "punpckhwd %%xmm2,%%xmm1 \n" \
  1904. "movdqu %%xmm0,(%[dst_argb]) \n" \
  1905. "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
  1906. "lea 0x20(%[dst_argb]), %[dst_argb] \n"
  1907. // Store 8 RGBA values.
  1908. #define STORERGBA \
  1909. "pcmpeqb %%xmm5,%%xmm5 \n" \
  1910. "punpcklbw %%xmm2,%%xmm1 \n" \
  1911. "punpcklbw %%xmm0,%%xmm5 \n" \
  1912. "movdqa %%xmm5,%%xmm0 \n" \
  1913. "punpcklwd %%xmm1,%%xmm5 \n" \
  1914. "punpckhwd %%xmm1,%%xmm0 \n" \
  1915. "movdqu %%xmm5,(%[dst_rgba]) \n" \
  1916. "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
  1917. "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
  1918. // Store 8 AR30 values.
  1919. #define STOREAR30 \
  1920. "psraw $0x4,%%xmm0 \n" \
  1921. "psraw $0x4,%%xmm1 \n" \
  1922. "psraw $0x4,%%xmm2 \n" \
  1923. "pminsw %%xmm7,%%xmm0 \n" \
  1924. "pminsw %%xmm7,%%xmm1 \n" \
  1925. "pminsw %%xmm7,%%xmm2 \n" \
  1926. "pmaxsw %%xmm6,%%xmm0 \n" \
  1927. "pmaxsw %%xmm6,%%xmm1 \n" \
  1928. "pmaxsw %%xmm6,%%xmm2 \n" \
  1929. "psllw $0x4,%%xmm2 \n" \
  1930. "movdqa %%xmm0,%%xmm3 \n" \
  1931. "punpcklwd %%xmm2,%%xmm0 \n" \
  1932. "punpckhwd %%xmm2,%%xmm3 \n" \
  1933. "movdqa %%xmm1,%%xmm2 \n" \
  1934. "punpcklwd %%xmm5,%%xmm1 \n" \
  1935. "punpckhwd %%xmm5,%%xmm2 \n" \
  1936. "pslld $0xa,%%xmm1 \n" \
  1937. "pslld $0xa,%%xmm2 \n" \
  1938. "por %%xmm1,%%xmm0 \n" \
  1939. "por %%xmm2,%%xmm3 \n" \
  1940. "movdqu %%xmm0,(%[dst_ar30]) \n" \
  1941. "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
  1942. "lea 0x20(%[dst_ar30]), %[dst_ar30] \n"
  1943. void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
  1944. const uint8_t* u_buf,
  1945. const uint8_t* v_buf,
  1946. uint8_t* dst_argb,
  1947. const struct YuvConstants* yuvconstants,
  1948. int width) {
  1949. asm volatile (
  1950. YUVTORGB_SETUP(yuvconstants)
  1951. "sub %[u_buf],%[v_buf] \n"
  1952. "pcmpeqb %%xmm5,%%xmm5 \n"
  1953. LABELALIGN
  1954. "1: \n"
  1955. READYUV444
  1956. YUVTORGB(yuvconstants)
  1957. STOREARGB
  1958. "sub $0x8,%[width] \n"
  1959. "jg 1b \n"
  1960. : [y_buf]"+r"(y_buf), // %[y_buf]
  1961. [u_buf]"+r"(u_buf), // %[u_buf]
  1962. [v_buf]"+r"(v_buf), // %[v_buf]
  1963. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  1964. [width]"+rm"(width) // %[width]
  1965. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  1966. : "memory", "cc", YUVTORGB_REGS
  1967. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  1968. );
  1969. }
  1970. void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
  1971. const uint8_t* u_buf,
  1972. const uint8_t* v_buf,
  1973. uint8_t* dst_rgb24,
  1974. const struct YuvConstants* yuvconstants,
  1975. int width) {
  1976. asm volatile (
  1977. YUVTORGB_SETUP(yuvconstants)
  1978. "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
  1979. "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
  1980. "sub %[u_buf],%[v_buf] \n"
  1981. LABELALIGN
  1982. "1: \n"
  1983. READYUV422
  1984. YUVTORGB(yuvconstants)
  1985. "punpcklbw %%xmm1,%%xmm0 \n"
  1986. "punpcklbw %%xmm2,%%xmm2 \n"
  1987. "movdqa %%xmm0,%%xmm1 \n"
  1988. "punpcklwd %%xmm2,%%xmm0 \n"
  1989. "punpckhwd %%xmm2,%%xmm1 \n"
  1990. "pshufb %%xmm5,%%xmm0 \n"
  1991. "pshufb %%xmm6,%%xmm1 \n"
  1992. "palignr $0xc,%%xmm0,%%xmm1 \n"
  1993. "movq %%xmm0,(%[dst_rgb24]) \n"
  1994. "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
  1995. "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
  1996. "subl $0x8,%[width] \n"
  1997. "jg 1b \n"
  1998. : [y_buf]"+r"(y_buf), // %[y_buf]
  1999. [u_buf]"+r"(u_buf), // %[u_buf]
  2000. [v_buf]"+r"(v_buf), // %[v_buf]
  2001. [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
  2002. #if defined(__i386__)
  2003. [width]"+m"(width) // %[width]
  2004. #else
  2005. [width]"+rm"(width) // %[width]
  2006. #endif
  2007. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2008. [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
  2009. [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  2010. : "memory", "cc", YUVTORGB_REGS
  2011. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  2012. );
  2013. }
  2014. void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
  2015. const uint8_t* u_buf,
  2016. const uint8_t* v_buf,
  2017. uint8_t* dst_argb,
  2018. const struct YuvConstants* yuvconstants,
  2019. int width) {
  2020. asm volatile (
  2021. YUVTORGB_SETUP(yuvconstants)
  2022. "sub %[u_buf],%[v_buf] \n"
  2023. "pcmpeqb %%xmm5,%%xmm5 \n"
  2024. LABELALIGN
  2025. "1: \n"
  2026. READYUV422
  2027. YUVTORGB(yuvconstants)
  2028. STOREARGB
  2029. "sub $0x8,%[width] \n"
  2030. "jg 1b \n"
  2031. : [y_buf]"+r"(y_buf), // %[y_buf]
  2032. [u_buf]"+r"(u_buf), // %[u_buf]
  2033. [v_buf]"+r"(v_buf), // %[v_buf]
  2034. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2035. [width]"+rm"(width) // %[width]
  2036. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2037. : "memory", "cc", YUVTORGB_REGS
  2038. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2039. );
  2040. }
  2041. void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
  2042. const uint8_t* u_buf,
  2043. const uint8_t* v_buf,
  2044. uint8_t* dst_ar30,
  2045. const struct YuvConstants* yuvconstants,
  2046. int width) {
  2047. asm volatile (
  2048. YUVTORGB_SETUP(yuvconstants)
  2049. "sub %[u_buf],%[v_buf] \n"
  2050. "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
  2051. "psrlw $14,%%xmm5 \n"
  2052. "psllw $4,%%xmm5 \n" // 2 alpha bits
  2053. "pxor %%xmm6,%%xmm6 \n"
  2054. "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
  2055. "psrlw $6,%%xmm7 \n" // 1023 for max
  2056. LABELALIGN
  2057. "1: \n"
  2058. READYUV422
  2059. YUVTORGB16(yuvconstants)
  2060. STOREAR30
  2061. "sub $0x8,%[width] \n"
  2062. "jg 1b \n"
  2063. : [y_buf]"+r"(y_buf), // %[y_buf]
  2064. [u_buf]"+r"(u_buf), // %[u_buf]
  2065. [v_buf]"+r"(v_buf), // %[v_buf]
  2066. [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
  2067. [width]"+rm"(width) // %[width]
  2068. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2069. : "memory", "cc", YUVTORGB_REGS
  2070. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  2071. );
  2072. }
  2073. // 10 bit YUV to ARGB
  2074. void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
  2075. const uint16_t* u_buf,
  2076. const uint16_t* v_buf,
  2077. uint8_t* dst_argb,
  2078. const struct YuvConstants* yuvconstants,
  2079. int width) {
  2080. asm volatile (
  2081. YUVTORGB_SETUP(yuvconstants)
  2082. "sub %[u_buf],%[v_buf] \n"
  2083. "pcmpeqb %%xmm5,%%xmm5 \n"
  2084. LABELALIGN
  2085. "1: \n"
  2086. READYUV210
  2087. YUVTORGB(yuvconstants)
  2088. STOREARGB
  2089. "sub $0x8,%[width] \n"
  2090. "jg 1b \n"
  2091. : [y_buf]"+r"(y_buf), // %[y_buf]
  2092. [u_buf]"+r"(u_buf), // %[u_buf]
  2093. [v_buf]"+r"(v_buf), // %[v_buf]
  2094. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2095. [width]"+rm"(width) // %[width]
  2096. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2097. : "memory", "cc", YUVTORGB_REGS
  2098. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2099. );
  2100. }
  2101. // 10 bit YUV to AR30
  2102. void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
  2103. const uint16_t* u_buf,
  2104. const uint16_t* v_buf,
  2105. uint8_t* dst_ar30,
  2106. const struct YuvConstants* yuvconstants,
  2107. int width) {
  2108. asm volatile (
  2109. YUVTORGB_SETUP(yuvconstants)
  2110. "sub %[u_buf],%[v_buf] \n"
  2111. "pcmpeqb %%xmm5,%%xmm5 \n"
  2112. "psrlw $14,%%xmm5 \n"
  2113. "psllw $4,%%xmm5 \n" // 2 alpha bits
  2114. "pxor %%xmm6,%%xmm6 \n"
  2115. "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
  2116. "psrlw $6,%%xmm7 \n" // 1023 for max
  2117. LABELALIGN
  2118. "1: \n"
  2119. READYUV210
  2120. YUVTORGB16(yuvconstants)
  2121. STOREAR30
  2122. "sub $0x8,%[width] \n"
  2123. "jg 1b \n"
  2124. : [y_buf]"+r"(y_buf), // %[y_buf]
  2125. [u_buf]"+r"(u_buf), // %[u_buf]
  2126. [v_buf]"+r"(v_buf), // %[v_buf]
  2127. [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
  2128. [width]"+rm"(width) // %[width]
  2129. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2130. : "memory", "cc", YUVTORGB_REGS
  2131. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  2132. );
  2133. }
  2134. #ifdef HAS_I422ALPHATOARGBROW_SSSE3
  2135. void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
  2136. const uint8_t* u_buf,
  2137. const uint8_t* v_buf,
  2138. const uint8_t* a_buf,
  2139. uint8_t* dst_argb,
  2140. const struct YuvConstants* yuvconstants,
  2141. int width) {
  2142. // clang-format off
  2143. asm volatile (
  2144. YUVTORGB_SETUP(yuvconstants)
  2145. "sub %[u_buf],%[v_buf] \n"
  2146. LABELALIGN
  2147. "1: \n"
  2148. READYUVA422
  2149. YUVTORGB(yuvconstants)
  2150. STOREARGB
  2151. "subl $0x8,%[width] \n"
  2152. "jg 1b \n"
  2153. : [y_buf]"+r"(y_buf), // %[y_buf]
  2154. [u_buf]"+r"(u_buf), // %[u_buf]
  2155. [v_buf]"+r"(v_buf), // %[v_buf]
  2156. [a_buf]"+r"(a_buf), // %[a_buf]
  2157. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2158. #if defined(__i386__)
  2159. [width]"+m"(width) // %[width]
  2160. #else
  2161. [width]"+rm"(width) // %[width]
  2162. #endif
  2163. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2164. : "memory", "cc", YUVTORGB_REGS
  2165. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2166. );
  2167. // clang-format on
  2168. }
  2169. #endif // HAS_I422ALPHATOARGBROW_SSSE3
  2170. void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
  2171. const uint8_t* uv_buf,
  2172. uint8_t* dst_argb,
  2173. const struct YuvConstants* yuvconstants,
  2174. int width) {
  2175. // clang-format off
  2176. asm volatile (
  2177. YUVTORGB_SETUP(yuvconstants)
  2178. "pcmpeqb %%xmm5,%%xmm5 \n"
  2179. LABELALIGN
  2180. "1: \n"
  2181. READNV12
  2182. YUVTORGB(yuvconstants)
  2183. STOREARGB
  2184. "sub $0x8,%[width] \n"
  2185. "jg 1b \n"
  2186. : [y_buf]"+r"(y_buf), // %[y_buf]
  2187. [uv_buf]"+r"(uv_buf), // %[uv_buf]
  2188. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2189. [width]"+rm"(width) // %[width]
  2190. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2191. : "memory", "cc", YUVTORGB_REGS
  2192. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2193. );
  2194. // clang-format on
  2195. }
  2196. void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
  2197. const uint8_t* vu_buf,
  2198. uint8_t* dst_argb,
  2199. const struct YuvConstants* yuvconstants,
  2200. int width) {
  2201. // clang-format off
  2202. asm volatile (
  2203. YUVTORGB_SETUP(yuvconstants)
  2204. "pcmpeqb %%xmm5,%%xmm5 \n"
  2205. LABELALIGN
  2206. "1: \n"
  2207. READNV21
  2208. YUVTORGB(yuvconstants)
  2209. STOREARGB
  2210. "sub $0x8,%[width] \n"
  2211. "jg 1b \n"
  2212. : [y_buf]"+r"(y_buf), // %[y_buf]
  2213. [vu_buf]"+r"(vu_buf), // %[vu_buf]
  2214. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2215. [width]"+rm"(width) // %[width]
  2216. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2217. [kShuffleNV21]"m"(kShuffleNV21)
  2218. : "memory", "cc", YUVTORGB_REGS
  2219. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2220. );
  2221. // clang-format on
  2222. }
  2223. void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
  2224. uint8_t* dst_argb,
  2225. const struct YuvConstants* yuvconstants,
  2226. int width) {
  2227. // clang-format off
  2228. asm volatile (
  2229. YUVTORGB_SETUP(yuvconstants)
  2230. "pcmpeqb %%xmm5,%%xmm5 \n"
  2231. LABELALIGN
  2232. "1: \n"
  2233. READYUY2
  2234. YUVTORGB(yuvconstants)
  2235. STOREARGB
  2236. "sub $0x8,%[width] \n"
  2237. "jg 1b \n"
  2238. : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
  2239. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2240. [width]"+rm"(width) // %[width]
  2241. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2242. [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
  2243. [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  2244. : "memory", "cc", YUVTORGB_REGS
  2245. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2246. );
  2247. // clang-format on
  2248. }
  2249. void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
  2250. uint8_t* dst_argb,
  2251. const struct YuvConstants* yuvconstants,
  2252. int width) {
  2253. // clang-format off
  2254. asm volatile (
  2255. YUVTORGB_SETUP(yuvconstants)
  2256. "pcmpeqb %%xmm5,%%xmm5 \n"
  2257. LABELALIGN
  2258. "1: \n"
  2259. READUYVY
  2260. YUVTORGB(yuvconstants)
  2261. STOREARGB
  2262. "sub $0x8,%[width] \n"
  2263. "jg 1b \n"
  2264. : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
  2265. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2266. [width]"+rm"(width) // %[width]
  2267. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2268. [kShuffleUYVYY]"m"(kShuffleUYVYY),
  2269. [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  2270. : "memory", "cc", YUVTORGB_REGS
  2271. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2272. );
  2273. // clang-format on
  2274. }
  2275. void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
  2276. const uint8_t* u_buf,
  2277. const uint8_t* v_buf,
  2278. uint8_t* dst_rgba,
  2279. const struct YuvConstants* yuvconstants,
  2280. int width) {
  2281. asm volatile (
  2282. YUVTORGB_SETUP(yuvconstants)
  2283. "sub %[u_buf],%[v_buf] \n"
  2284. "pcmpeqb %%xmm5,%%xmm5 \n"
  2285. LABELALIGN
  2286. "1: \n"
  2287. READYUV422
  2288. YUVTORGB(yuvconstants)
  2289. STORERGBA
  2290. "sub $0x8,%[width] \n"
  2291. "jg 1b \n"
  2292. : [y_buf]"+r"(y_buf), // %[y_buf]
  2293. [u_buf]"+r"(u_buf), // %[u_buf]
  2294. [v_buf]"+r"(v_buf), // %[v_buf]
  2295. [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
  2296. [width]"+rm"(width) // %[width]
  2297. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2298. : "memory", "cc", YUVTORGB_REGS
  2299. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2300. );
  2301. }
  2302. #endif // HAS_I422TOARGBROW_SSSE3
  2303. // Read 16 UV from 444
  2304. #define READYUV444_AVX2 \
  2305. "vmovdqu (%[u_buf]),%%xmm0 \n" \
  2306. "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  2307. "lea 0x10(%[u_buf]),%[u_buf] \n" \
  2308. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2309. "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  2310. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  2311. "vmovdqu (%[y_buf]),%%xmm4 \n" \
  2312. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  2313. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  2314. "lea 0x10(%[y_buf]),%[y_buf] \n"
  2315. // Read 8 UV from 422, upsample to 16 UV.
  2316. #define READYUV422_AVX2 \
  2317. "vmovq (%[u_buf]),%%xmm0 \n" \
  2318. "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  2319. "lea 0x8(%[u_buf]),%[u_buf] \n" \
  2320. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  2321. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2322. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  2323. "vmovdqu (%[y_buf]),%%xmm4 \n" \
  2324. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  2325. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  2326. "lea 0x10(%[y_buf]),%[y_buf] \n"
  2327. // Read 8 UV from 210 10 bit, upsample to 16 UV
  2328. // TODO(fbarchard): Consider vshufb to replace pack/unpack
  2329. // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
  2330. #define READYUV210_AVX2 \
  2331. "vmovdqu (%[u_buf]),%%xmm0 \n" \
  2332. "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  2333. "lea 0x10(%[u_buf]),%[u_buf] \n" \
  2334. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2335. "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  2336. "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \
  2337. "vpsraw $0x2,%%ymm0,%%ymm0 \n" \
  2338. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  2339. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  2340. "vmovdqu (%[y_buf]),%%ymm4 \n" \
  2341. "vpsllw $0x6,%%ymm4,%%ymm4 \n" \
  2342. "lea 0x20(%[y_buf]),%[y_buf] \n"
  2343. // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
  2344. #define READYUVA422_AVX2 \
  2345. "vmovq (%[u_buf]),%%xmm0 \n" \
  2346. "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  2347. "lea 0x8(%[u_buf]),%[u_buf] \n" \
  2348. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  2349. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2350. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  2351. "vmovdqu (%[y_buf]),%%xmm4 \n" \
  2352. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  2353. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  2354. "lea 0x10(%[y_buf]),%[y_buf] \n" \
  2355. "vmovdqu (%[a_buf]),%%xmm5 \n" \
  2356. "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
  2357. "lea 0x10(%[a_buf]),%[a_buf] \n"
  2358. // Read 8 UV from NV12, upsample to 16 UV.
  2359. #define READNV12_AVX2 \
  2360. "vmovdqu (%[uv_buf]),%%xmm0 \n" \
  2361. "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
  2362. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2363. "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  2364. "vmovdqu (%[y_buf]),%%xmm4 \n" \
  2365. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  2366. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  2367. "lea 0x10(%[y_buf]),%[y_buf] \n"
  2368. // Read 8 VU from NV21, upsample to 16 UV.
  2369. #define READNV21_AVX2 \
  2370. "vmovdqu (%[vu_buf]),%%xmm0 \n" \
  2371. "lea 0x10(%[vu_buf]),%[vu_buf] \n" \
  2372. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2373. "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
  2374. "vmovdqu (%[y_buf]),%%xmm4 \n" \
  2375. "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  2376. "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  2377. "lea 0x10(%[y_buf]),%[y_buf] \n"
  2378. // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
  2379. #define READYUY2_AVX2 \
  2380. "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
  2381. "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
  2382. "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \
  2383. "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
  2384. "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"
  2385. // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
  2386. #define READUYVY_AVX2 \
  2387. "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
  2388. "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
  2389. "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \
  2390. "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
  2391. "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
  2392. #if defined(__x86_64__)
  2393. #define YUVTORGB_SETUP_AVX2(yuvconstants) \
  2394. "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
  2395. "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
  2396. "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
  2397. "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
  2398. "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \
  2399. "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
  2400. "vmovdqa 192(%[yuvconstants]),%%ymm14 \n"
  2401. #define YUVTORGB16_AVX2(yuvconstants) \
  2402. "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
  2403. "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
  2404. "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
  2405. "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
  2406. "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
  2407. "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
  2408. "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
  2409. "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  2410. "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
  2411. "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
  2412. #define YUVTORGB_REGS_AVX2 \
  2413. "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
  2414. #else // 32 bit: no ymm8-ymm14, so reference yuvconstants from memory each time.
  2415. #define YUVTORGB_SETUP_AVX2(yuvconstants)
  2416. #define YUVTORGB16_AVX2(yuvconstants) \
  2417. "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \
  2418. "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \
  2419. "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \
  2420. "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \
  2421. "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
  2422. "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \
  2423. "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
  2424. "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \
  2425. "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
  2426. "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
  2427. "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  2428. "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
  2429. "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
  2430. #define YUVTORGB_REGS_AVX2
  2431. #endif
  2432. #define YUVTORGB_AVX2(yuvconstants) \
  2433. YUVTORGB16_AVX2(yuvconstants) \
  2434. "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
  2435. "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
  2436. "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
  2437. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  2438. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
  2439. "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
  2440. // Store 16 ARGB values.
  2441. #define STOREARGB_AVX2 \
  2442. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  2443. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2444. "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
  2445. "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
  2446. "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
  2447. "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
  2448. "vmovdqu %%ymm1,(%[dst_argb]) \n" \
  2449. "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
  2450. "lea 0x40(%[dst_argb]), %[dst_argb] \n"
  2451. // Store 16 AR30 values.
  2452. #define STOREAR30_AVX2 \
  2453. "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
  2454. "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
  2455. "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
  2456. "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
  2457. "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
  2458. "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
  2459. "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
  2460. "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
  2461. "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
  2462. "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
  2463. "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  2464. "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  2465. "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
  2466. "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
  2467. "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
  2468. "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
  2469. "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
  2470. "vpslld $0xa,%%ymm1,%%ymm1 \n" \
  2471. "vpslld $0xa,%%ymm2,%%ymm2 \n" \
  2472. "vpor %%ymm1,%%ymm0,%%ymm0 \n" \
  2473. "vpor %%ymm2,%%ymm3,%%ymm3 \n" \
  2474. "vmovdqu %%ymm0,(%[dst_ar30]) \n" \
  2475. "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
  2476. "lea 0x40(%[dst_ar30]), %[dst_ar30] \n"
  2477. #ifdef HAS_I444TOARGBROW_AVX2
  2478. // 16 pixels
  2479. // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
  2480. void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
  2481. const uint8_t* u_buf,
  2482. const uint8_t* v_buf,
  2483. uint8_t* dst_argb,
  2484. const struct YuvConstants* yuvconstants,
  2485. int width) {
  2486. asm volatile (
  2487. YUVTORGB_SETUP_AVX2(yuvconstants)
  2488. "sub %[u_buf],%[v_buf] \n"
  2489. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2490. LABELALIGN
  2491. "1: \n"
  2492. READYUV444_AVX2
  2493. YUVTORGB_AVX2(yuvconstants)
  2494. STOREARGB_AVX2
  2495. "sub $0x10,%[width] \n"
  2496. "jg 1b \n"
  2497. "vzeroupper \n"
  2498. : [y_buf]"+r"(y_buf), // %[y_buf]
  2499. [u_buf]"+r"(u_buf), // %[u_buf]
  2500. [v_buf]"+r"(v_buf), // %[v_buf]
  2501. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2502. [width]"+rm"(width) // %[width]
  2503. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2504. : "memory", "cc", YUVTORGB_REGS_AVX2
  2505. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2506. );
  2507. }
  2508. #endif // HAS_I444TOARGBROW_AVX2
  2509. #if defined(HAS_I422TOARGBROW_AVX2)
  2510. // 16 pixels
  2511. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2512. void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
  2513. const uint8_t* u_buf,
  2514. const uint8_t* v_buf,
  2515. uint8_t* dst_argb,
  2516. const struct YuvConstants* yuvconstants,
  2517. int width) {
  2518. asm volatile (
  2519. YUVTORGB_SETUP_AVX2(yuvconstants)
  2520. "sub %[u_buf],%[v_buf] \n"
  2521. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2522. LABELALIGN
  2523. "1: \n"
  2524. READYUV422_AVX2
  2525. YUVTORGB_AVX2(yuvconstants)
  2526. STOREARGB_AVX2
  2527. "sub $0x10,%[width] \n"
  2528. "jg 1b \n"
  2529. "vzeroupper \n"
  2530. : [y_buf]"+r"(y_buf), // %[y_buf]
  2531. [u_buf]"+r"(u_buf), // %[u_buf]
  2532. [v_buf]"+r"(v_buf), // %[v_buf]
  2533. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2534. [width]"+rm"(width) // %[width]
  2535. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2536. : "memory", "cc", YUVTORGB_REGS_AVX2
  2537. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2538. );
  2539. }
  2540. #endif // HAS_I422TOARGBROW_AVX2
  2541. #if defined(HAS_I422TOAR30ROW_AVX2)
  2542. // 16 pixels
  2543. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
  2544. void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
  2545. const uint8_t* u_buf,
  2546. const uint8_t* v_buf,
  2547. uint8_t* dst_ar30,
  2548. const struct YuvConstants* yuvconstants,
  2549. int width) {
  2550. asm volatile (
  2551. YUVTORGB_SETUP_AVX2(yuvconstants)
  2552. "sub %[u_buf],%[v_buf] \n"
  2553. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
  2554. "vpsrlw $14,%%ymm5,%%ymm5 \n"
  2555. "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
  2556. "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
  2557. "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
  2558. "vpsrlw $6,%%ymm7,%%ymm7 \n"
  2559. LABELALIGN
  2560. "1: \n"
  2561. READYUV422_AVX2
  2562. YUVTORGB16_AVX2(yuvconstants)
  2563. STOREAR30_AVX2
  2564. "sub $0x10,%[width] \n"
  2565. "jg 1b \n"
  2566. "vzeroupper \n"
  2567. : [y_buf]"+r"(y_buf), // %[y_buf]
  2568. [u_buf]"+r"(u_buf), // %[u_buf]
  2569. [v_buf]"+r"(v_buf), // %[v_buf]
  2570. [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
  2571. [width]"+rm"(width) // %[width]
  2572. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2573. : "memory", "cc", YUVTORGB_REGS_AVX2
  2574. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  2575. );
  2576. }
  2577. #endif // HAS_I422TOAR30ROW_AVX2
  2578. #if defined(HAS_I210TOARGBROW_AVX2)
  2579. // 16 pixels
  2580. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2581. void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
  2582. const uint16_t* u_buf,
  2583. const uint16_t* v_buf,
  2584. uint8_t* dst_argb,
  2585. const struct YuvConstants* yuvconstants,
  2586. int width) {
  2587. asm volatile (
  2588. YUVTORGB_SETUP_AVX2(yuvconstants)
  2589. "sub %[u_buf],%[v_buf] \n"
  2590. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2591. LABELALIGN
  2592. "1: \n"
  2593. READYUV210_AVX2
  2594. YUVTORGB_AVX2(yuvconstants)
  2595. STOREARGB_AVX2
  2596. "sub $0x10,%[width] \n"
  2597. "jg 1b \n"
  2598. "vzeroupper \n"
  2599. : [y_buf]"+r"(y_buf), // %[y_buf]
  2600. [u_buf]"+r"(u_buf), // %[u_buf]
  2601. [v_buf]"+r"(v_buf), // %[v_buf]
  2602. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2603. [width]"+rm"(width) // %[width]
  2604. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2605. : "memory", "cc", YUVTORGB_REGS_AVX2
  2606. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2607. );
  2608. }
  2609. #endif // HAS_I210TOARGBROW_AVX2
  2610. #if defined(HAS_I210TOAR30ROW_AVX2)
  2611. // 16 pixels
  2612. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
  2613. void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
  2614. const uint16_t* u_buf,
  2615. const uint16_t* v_buf,
  2616. uint8_t* dst_ar30,
  2617. const struct YuvConstants* yuvconstants,
  2618. int width) {
  2619. asm volatile (
  2620. YUVTORGB_SETUP_AVX2(yuvconstants)
  2621. "sub %[u_buf],%[v_buf] \n"
  2622. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
  2623. "vpsrlw $14,%%ymm5,%%ymm5 \n"
  2624. "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
  2625. "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
  2626. "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
  2627. "vpsrlw $6,%%ymm7,%%ymm7 \n"
  2628. LABELALIGN
  2629. "1: \n"
  2630. READYUV210_AVX2
  2631. YUVTORGB16_AVX2(yuvconstants)
  2632. STOREAR30_AVX2
  2633. "sub $0x10,%[width] \n"
  2634. "jg 1b \n"
  2635. "vzeroupper \n"
  2636. : [y_buf]"+r"(y_buf), // %[y_buf]
  2637. [u_buf]"+r"(u_buf), // %[u_buf]
  2638. [v_buf]"+r"(v_buf), // %[v_buf]
  2639. [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
  2640. [width]"+rm"(width) // %[width]
  2641. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2642. : "memory", "cc", YUVTORGB_REGS_AVX2
  2643. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2644. );
  2645. }
  2646. #endif // HAS_I210TOAR30ROW_AVX2
  2647. #if defined(HAS_I422ALPHATOARGBROW_AVX2)
  2648. // 16 pixels
  2649. // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
  2650. void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
  2651. const uint8_t* u_buf,
  2652. const uint8_t* v_buf,
  2653. const uint8_t* a_buf,
  2654. uint8_t* dst_argb,
  2655. const struct YuvConstants* yuvconstants,
  2656. int width) {
  2657. // clang-format off
  2658. asm volatile (
  2659. YUVTORGB_SETUP_AVX2(yuvconstants)
  2660. "sub %[u_buf],%[v_buf] \n"
  2661. LABELALIGN
  2662. "1: \n"
  2663. READYUVA422_AVX2
  2664. YUVTORGB_AVX2(yuvconstants)
  2665. STOREARGB_AVX2
  2666. "subl $0x10,%[width] \n"
  2667. "jg 1b \n"
  2668. "vzeroupper \n"
  2669. : [y_buf]"+r"(y_buf), // %[y_buf]
  2670. [u_buf]"+r"(u_buf), // %[u_buf]
  2671. [v_buf]"+r"(v_buf), // %[v_buf]
  2672. [a_buf]"+r"(a_buf), // %[a_buf]
  2673. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2674. #if defined(__i386__)
  2675. [width]"+m"(width) // %[width]
  2676. #else
  2677. [width]"+rm"(width) // %[width]
  2678. #endif
  2679. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2680. : "memory", "cc", YUVTORGB_REGS_AVX2
  2681. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2682. );
  2683. // clang-format on
  2684. }
  2685. #endif // HAS_I422ALPHATOARGBROW_AVX2
  2686. #if defined(HAS_I422TORGBAROW_AVX2)
  2687. // 16 pixels
  2688. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
  2689. void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
  2690. const uint8_t* u_buf,
  2691. const uint8_t* v_buf,
  2692. uint8_t* dst_argb,
  2693. const struct YuvConstants* yuvconstants,
  2694. int width) {
  2695. asm volatile (
  2696. YUVTORGB_SETUP_AVX2(yuvconstants)
  2697. "sub %[u_buf],%[v_buf] \n"
  2698. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2699. LABELALIGN
  2700. "1: \n"
  2701. READYUV422_AVX2
  2702. YUVTORGB_AVX2(yuvconstants)
  2703. // Step 3: Weave into RGBA
  2704. "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
  2705. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  2706. "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
  2707. "vpermq $0xd8,%%ymm2,%%ymm2 \n"
  2708. "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
  2709. "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
  2710. "vmovdqu %%ymm0,(%[dst_argb]) \n"
  2711. "vmovdqu %%ymm1,0x20(%[dst_argb]) \n"
  2712. "lea 0x40(%[dst_argb]),%[dst_argb] \n"
  2713. "sub $0x10,%[width] \n"
  2714. "jg 1b \n"
  2715. "vzeroupper \n"
  2716. : [y_buf]"+r"(y_buf), // %[y_buf]
  2717. [u_buf]"+r"(u_buf), // %[u_buf]
  2718. [v_buf]"+r"(v_buf), // %[v_buf]
  2719. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2720. [width]"+rm"(width) // %[width]
  2721. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2722. : "memory", "cc", YUVTORGB_REGS_AVX2
  2723. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2724. );
  2725. }
  2726. #endif // HAS_I422TORGBAROW_AVX2
  2727. #if defined(HAS_NV12TOARGBROW_AVX2)
  2728. // 16 pixels.
  2729. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2730. void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
  2731. const uint8_t* uv_buf,
  2732. uint8_t* dst_argb,
  2733. const struct YuvConstants* yuvconstants,
  2734. int width) {
  2735. // clang-format off
  2736. asm volatile (
  2737. YUVTORGB_SETUP_AVX2(yuvconstants)
  2738. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2739. LABELALIGN
  2740. "1: \n"
  2741. READNV12_AVX2
  2742. YUVTORGB_AVX2(yuvconstants)
  2743. STOREARGB_AVX2
  2744. "sub $0x10,%[width] \n"
  2745. "jg 1b \n"
  2746. "vzeroupper \n"
  2747. : [y_buf]"+r"(y_buf), // %[y_buf]
  2748. [uv_buf]"+r"(uv_buf), // %[uv_buf]
  2749. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2750. [width]"+rm"(width) // %[width]
  2751. : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  2752. : "memory", "cc", YUVTORGB_REGS_AVX2
  2753. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2754. );
  2755. // clang-format on
  2756. }
  2757. #endif // HAS_NV12TOARGBROW_AVX2
  2758. #if defined(HAS_NV21TOARGBROW_AVX2)
  2759. // 16 pixels.
  2760. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
  2761. void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
  2762. const uint8_t* vu_buf,
  2763. uint8_t* dst_argb,
  2764. const struct YuvConstants* yuvconstants,
  2765. int width) {
  2766. // clang-format off
  2767. asm volatile (
  2768. YUVTORGB_SETUP_AVX2(yuvconstants)
  2769. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2770. LABELALIGN
  2771. "1: \n"
  2772. READNV21_AVX2
  2773. YUVTORGB_AVX2(yuvconstants)
  2774. STOREARGB_AVX2
  2775. "sub $0x10,%[width] \n"
  2776. "jg 1b \n"
  2777. "vzeroupper \n"
  2778. : [y_buf]"+r"(y_buf), // %[y_buf]
  2779. [vu_buf]"+r"(vu_buf), // %[vu_buf]
  2780. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2781. [width]"+rm"(width) // %[width]
  2782. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2783. [kShuffleNV21]"m"(kShuffleNV21)
  2784. : "memory", "cc", YUVTORGB_REGS_AVX2
  2785. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2786. );
  2787. // clang-format on
  2788. }
  2789. #endif // HAS_NV21TOARGBROW_AVX2
  2790. #if defined(HAS_YUY2TOARGBROW_AVX2)
  2791. // 16 pixels.
  2792. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2793. void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
  2794. uint8_t* dst_argb,
  2795. const struct YuvConstants* yuvconstants,
  2796. int width) {
  2797. // clang-format off
  2798. asm volatile (
  2799. YUVTORGB_SETUP_AVX2(yuvconstants)
  2800. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2801. LABELALIGN
  2802. "1: \n"
  2803. READYUY2_AVX2
  2804. YUVTORGB_AVX2(yuvconstants)
  2805. STOREARGB_AVX2
  2806. "sub $0x10,%[width] \n"
  2807. "jg 1b \n"
  2808. "vzeroupper \n"
  2809. : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
  2810. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2811. [width]"+rm"(width) // %[width]
  2812. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2813. [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
  2814. [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  2815. : "memory", "cc", YUVTORGB_REGS_AVX2
  2816. "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2817. );
  2818. // clang-format on
  2819. }
  2820. #endif // HAS_YUY2TOARGBROW_AVX2
  2821. #if defined(HAS_UYVYTOARGBROW_AVX2)
  2822. // 16 pixels.
  2823. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
  2824. void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
  2825. uint8_t* dst_argb,
  2826. const struct YuvConstants* yuvconstants,
  2827. int width) {
  2828. // clang-format off
  2829. asm volatile (
  2830. YUVTORGB_SETUP_AVX2(yuvconstants)
  2831. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  2832. LABELALIGN
  2833. "1: \n"
  2834. READUYVY_AVX2
  2835. YUVTORGB_AVX2(yuvconstants)
  2836. STOREARGB_AVX2
  2837. "sub $0x10,%[width] \n"
  2838. "jg 1b \n"
  2839. "vzeroupper \n"
  2840. : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
  2841. [dst_argb]"+r"(dst_argb), // %[dst_argb]
  2842. [width]"+rm"(width) // %[width]
  2843. : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
  2844. [kShuffleUYVYY]"m"(kShuffleUYVYY),
  2845. [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  2846. : "memory", "cc", YUVTORGB_REGS_AVX2
  2847. "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  2848. );
  2849. // clang-format on
  2850. }
  2851. #endif // HAS_UYVYTOARGBROW_AVX2
  2852. #ifdef HAS_I400TOARGBROW_SSE2
  2853. void I400ToARGBRow_SSE2(const uint8_t* y_buf,
  2854. uint8_t* dst_argb,
  2855. const struct YuvConstants* yuvconstants,
  2856. int width) {
  2857. asm volatile(
  2858. "movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164
  2859. "movdqa 224(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16
  2860. "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
  2861. "pslld $0x18,%%xmm4 \n"
  2862. LABELALIGN
  2863. "1: \n"
  2864. // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
  2865. "movq (%0),%%xmm0 \n"
  2866. "lea 0x8(%0),%0 \n"
  2867. "punpcklbw %%xmm0,%%xmm0 \n"
  2868. "pmulhuw %%xmm2,%%xmm0 \n"
  2869. "paddsw %%xmm3,%%xmm0 \n"
  2870. "psraw $6, %%xmm0 \n"
  2871. "packuswb %%xmm0,%%xmm0 \n"
  2872. // Step 2: Weave into ARGB
  2873. "punpcklbw %%xmm0,%%xmm0 \n"
  2874. "movdqa %%xmm0,%%xmm1 \n"
  2875. "punpcklwd %%xmm0,%%xmm0 \n"
  2876. "punpckhwd %%xmm1,%%xmm1 \n"
  2877. "por %%xmm4,%%xmm0 \n"
  2878. "por %%xmm4,%%xmm1 \n"
  2879. "movdqu %%xmm0,(%1) \n"
  2880. "movdqu %%xmm1,0x10(%1) \n"
  2881. "lea 0x20(%1),%1 \n"
  2882. "sub $0x8,%2 \n"
  2883. "jg 1b \n"
  2884. : "+r"(y_buf), // %0
  2885. "+r"(dst_argb), // %1
  2886. "+rm"(width) // %2
  2887. : "r"(yuvconstants) // %3
  2888. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
  2889. }
  2890. #endif // HAS_I400TOARGBROW_SSE2
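// Illustrative scalar sketch (not part of the library; helper name is
// hypothetical) of the per-pixel math in the I400ToARGBRow functions, using
// the yg/ygb constants the asm loads from offsets 192/224 of yuvconstants:
// Y is widened to y*257, scaled, biased, shifted down 6 bits, clamped and
// replicated into B, G and R with alpha forced to 0xff.
static inline uint32_t I400Pixel_Sketch(uint8_t y, int yg, int ygb) {
  int g = ((y * 0x0101 * yg) >> 16) + ygb;  // pmulhuw + paddsw
  g >>= 6;                                  // psraw $0x6
  if (g < 0) g = 0;                         // packuswb saturates
  if (g > 255) g = 255;
  uint32_t v = (uint32_t)g;
  return 0xff000000u | (v << 16) | (v << 8) | v;  // little-endian B,G,R,A
}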
  2891. #ifdef HAS_I400TOARGBROW_AVX2
  2892. // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
  2893. // note: vpunpcklbw mutates and vpackuswb unmutates.
  2894. void I400ToARGBRow_AVX2(const uint8_t* y_buf,
  2895. uint8_t* dst_argb,
  2896. const struct YuvConstants* yuvconstants,
  2897. int width) {
  2898. asm volatile(
  2899. "vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164
  2900. "vmovdqa 224(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
  2901. "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
  2902. "vpslld $0x18,%%ymm4,%%ymm4 \n"
  2903. LABELALIGN
  2904. "1: \n"
  2905. // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
  2906. "vmovdqu (%0),%%xmm0 \n"
  2907. "lea 0x10(%0),%0 \n"
  2908. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  2909. "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
  2910. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  2911. "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n"
  2912. "vpsraw $0x6,%%ymm0,%%ymm0 \n"
  2913. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  2914. "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
  2915. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  2916. "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
  2917. "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
  2918. "vpor %%ymm4,%%ymm0,%%ymm0 \n"
  2919. "vpor %%ymm4,%%ymm1,%%ymm1 \n"
  2920. "vmovdqu %%ymm0,(%1) \n"
  2921. "vmovdqu %%ymm1,0x20(%1) \n"
  2922. "lea 0x40(%1),%1 \n"
  2923. "sub $0x10,%2 \n"
  2924. "jg 1b \n"
  2925. "vzeroupper \n"
  2926. : "+r"(y_buf), // %0
  2927. "+r"(dst_argb), // %1
  2928. "+rm"(width) // %2
  2929. : "r"(yuvconstants) // %3
  2930. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
  2931. }
  2932. #endif // HAS_I400TOARGBROW_AVX2
  2933. #ifdef HAS_MIRRORROW_SSSE3
  2934. // Shuffle table for reversing the bytes.
  2935. static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
  2936. 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
  2937. void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  2938. intptr_t temp_width = (intptr_t)(width);
  2939. asm volatile(
  2940. "movdqa %3,%%xmm5 \n"
  2941. LABELALIGN
  2942. "1: \n"
  2943. "movdqu -0x10(%0,%2,1),%%xmm0 \n"
  2944. "pshufb %%xmm5,%%xmm0 \n"
  2945. "movdqu %%xmm0,(%1) \n"
  2946. "lea 0x10(%1),%1 \n"
  2947. "sub $0x10,%2 \n"
  2948. "jg 1b \n"
  2949. : "+r"(src), // %0
  2950. "+r"(dst), // %1
  2951. "+r"(temp_width) // %2
  2952. : "m"(kShuffleMirror) // %3
  2953. : "memory", "cc", "xmm0", "xmm5");
  2954. }
  2955. #endif // HAS_MIRRORROW_SSSE3
  2956. #ifdef HAS_MIRRORROW_AVX2
  2957. void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  2958. intptr_t temp_width = (intptr_t)(width);
  2959. asm volatile(
  2960. "vbroadcastf128 %3,%%ymm5 \n"
  2961. LABELALIGN
  2962. "1: \n"
  2963. "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
  2964. "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
  2965. "vpermq $0x4e,%%ymm0,%%ymm0 \n"
  2966. "vmovdqu %%ymm0,(%1) \n"
  2967. "lea 0x20(%1),%1 \n"
  2968. "sub $0x20,%2 \n"
  2969. "jg 1b \n"
  2970. "vzeroupper \n"
  2971. : "+r"(src), // %0
  2972. "+r"(dst), // %1
  2973. "+r"(temp_width) // %2
  2974. : "m"(kShuffleMirror) // %3
  2975. : "memory", "cc", "xmm0", "xmm5");
  2976. }
  2977. #endif // HAS_MIRRORROW_AVX2
  2978. #ifdef HAS_MIRRORUVROW_SSSE3
  2979. // Shuffle table for reversing the UV.
  2980. static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
  2981. 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
  2982. void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  2983. intptr_t temp_width = (intptr_t)(width);
  2984. asm volatile(
  2985. "movdqa %3,%%xmm5 \n"
  2986. LABELALIGN
  2987. "1: \n"
  2988. "movdqu -0x10(%0,%2,2),%%xmm0 \n"
  2989. "pshufb %%xmm5,%%xmm0 \n"
  2990. "movdqu %%xmm0,(%1) \n"
  2991. "lea 0x10(%1),%1 \n"
  2992. "sub $0x8,%2 \n"
  2993. "jg 1b \n"
  2994. : "+r"(src_uv), // %0
  2995. "+r"(dst_uv), // %1
  2996. "+r"(temp_width) // %2
  2997. : "m"(kShuffleMirrorUV) // %3
  2998. : "memory", "cc", "xmm0", "xmm5");
  2999. }
  3000. #endif // HAS_MIRRORUVROW_SSSE3
  3001. #ifdef HAS_MIRRORUVROW_AVX2
  3002. void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  3003. intptr_t temp_width = (intptr_t)(width);
  3004. asm volatile(
  3005. "vbroadcastf128 %3,%%ymm5 \n"
  3006. LABELALIGN
  3007. "1: \n"
  3008. "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
  3009. "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
  3010. "vpermq $0x4e,%%ymm0,%%ymm0 \n"
  3011. "vmovdqu %%ymm0,(%1) \n"
  3012. "lea 0x20(%1),%1 \n"
  3013. "sub $0x10,%2 \n"
  3014. "jg 1b \n"
  3015. "vzeroupper \n"
  3016. : "+r"(src_uv), // %0
  3017. "+r"(dst_uv), // %1
  3018. "+r"(temp_width) // %2
  3019. : "m"(kShuffleMirrorUV) // %3
  3020. : "memory", "cc", "xmm0", "xmm5");
  3021. }
  3022. #endif // HAS_MIRRORUVROW_AVX2
  3023. #ifdef HAS_MIRRORSPLITUVROW_SSSE3
  3024. // Shuffle table for reversing the bytes of UV channels.
  3025. static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
  3026. 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
  3027. void MirrorSplitUVRow_SSSE3(const uint8_t* src,
  3028. uint8_t* dst_u,
  3029. uint8_t* dst_v,
  3030. int width) {
  3031. intptr_t temp_width = (intptr_t)(width);
  3032. asm volatile(
  3033. "movdqa %4,%%xmm1 \n"
  3034. "lea -0x10(%0,%3,2),%0 \n"
  3035. "sub %1,%2 \n"
  3036. LABELALIGN
  3037. "1: \n"
  3038. "movdqu (%0),%%xmm0 \n"
  3039. "lea -0x10(%0),%0 \n"
  3040. "pshufb %%xmm1,%%xmm0 \n"
  3041. "movlpd %%xmm0,(%1) \n"
  3042. "movhpd %%xmm0,0x00(%1,%2,1) \n"
  3043. "lea 0x8(%1),%1 \n"
  3044. "sub $8,%3 \n"
  3045. "jg 1b \n"
  3046. : "+r"(src), // %0
  3047. "+r"(dst_u), // %1
  3048. "+r"(dst_v), // %2
  3049. "+r"(temp_width) // %3
  3050. : "m"(kShuffleMirrorSplitUV) // %4
  3051. : "memory", "cc", "xmm0", "xmm1");
  3052. }
  3053. #endif // HAS_MIRRORSPLITUVROW_SSSE3
  3054. #ifdef HAS_RGB24MIRRORROW_SSSE3
  3055. // Shuffle first 5 pixels to last 5, mirrored. First byte is zero.
  3056. static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
  3057. 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u};
  3058. // Shuffle last 5 pixels to first 5, mirrored. Last byte is zero.
  3059. static const uvec8 kShuffleMirrorRGB1 = {
  3060. 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
  3061. // Shuffle 5 pixels at a time (15 bytes)
  3062. void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
  3063. uint8_t* dst_rgb24,
  3064. int width) {
  3065. intptr_t temp_width = (intptr_t)(width);
  3066. src_rgb24 += width * 3 - 48;
  3067. asm volatile(
  3068. "movdqa %3,%%xmm4 \n"
  3069. "movdqa %4,%%xmm5 \n"
  3070. LABELALIGN
  3071. "1: \n"
  3072. "movdqu (%0),%%xmm0 \n" // first 5
  3073. "movdqu 15(%0),%%xmm1 \n" // next 5
  3074. "movdqu 30(%0),%%xmm2 \n" // next 5
  3075. "movdqu 32(%0),%%xmm3 \n" // last 1 special
  3076. "pshufb %%xmm4,%%xmm0 \n"
  3077. "pshufb %%xmm4,%%xmm1 \n"
  3078. "pshufb %%xmm4,%%xmm2 \n"
  3079. "pshufb %%xmm5,%%xmm3 \n"
  3080. "lea -0x30(%0),%0 \n"
  3081. "movdqu %%xmm0,32(%1) \n" // last 5
  3082. "movdqu %%xmm1,17(%1) \n" // next 5
  3083. "movdqu %%xmm2,2(%1) \n" // next 5
  3084. "movlpd %%xmm3,0(%1) \n" // first 1
  3085. "lea 0x30(%1),%1 \n"
  3086. "sub $0x10,%2 \n"
  3087. "jg 1b \n"
  3088. : "+r"(src_rgb24), // %0
  3089. "+r"(dst_rgb24), // %1
  3090. "+r"(temp_width) // %2
  3091. : "m"(kShuffleMirrorRGB0), // %3
  3092. "m"(kShuffleMirrorRGB1) // %4
  3093. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  3094. }
  3095. #endif // HAS_RGB24MIRRORROW_SSSE3
  3096. #ifdef HAS_ARGBMIRRORROW_SSE2
  3097. void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  3098. intptr_t temp_width = (intptr_t)(width);
  3099. asm volatile(
  3100. "lea -0x10(%0,%2,4),%0 \n"
  3101. LABELALIGN
  3102. "1: \n"
  3103. "movdqu (%0),%%xmm0 \n"
  3104. "pshufd $0x1b,%%xmm0,%%xmm0 \n"
  3105. "lea -0x10(%0),%0 \n"
  3106. "movdqu %%xmm0,(%1) \n"
  3107. "lea 0x10(%1),%1 \n"
  3108. "sub $0x4,%2 \n"
  3109. "jg 1b \n"
  3110. : "+r"(src), // %0
  3111. "+r"(dst), // %1
  3112. "+r"(temp_width) // %2
  3113. :
  3114. : "memory", "cc", "xmm0");
  3115. }
  3116. #endif // HAS_ARGBMIRRORROW_SSE2
  3117. #ifdef HAS_ARGBMIRRORROW_AVX2
  3118. // Shuffle table for reversing the bytes.
  3119. static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
  3120. void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  3121. intptr_t temp_width = (intptr_t)(width);
  3122. asm volatile(
  3123. "vmovdqu %3,%%ymm5 \n"
  3124. LABELALIGN
  3125. "1: \n"
  3126. "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
  3127. "vmovdqu %%ymm0,(%1) \n"
  3128. "lea 0x20(%1),%1 \n"
  3129. "sub $0x8,%2 \n"
  3130. "jg 1b \n"
  3131. "vzeroupper \n"
  3132. : "+r"(src), // %0
  3133. "+r"(dst), // %1
  3134. "+r"(temp_width) // %2
  3135. : "m"(kARGBShuffleMirror_AVX2) // %3
  3136. : "memory", "cc", "xmm0", "xmm5");
  3137. }
  3138. #endif // HAS_ARGBMIRRORROW_AVX2
  3139. #ifdef HAS_SPLITUVROW_AVX2
  3140. void SplitUVRow_AVX2(const uint8_t* src_uv,
  3141. uint8_t* dst_u,
  3142. uint8_t* dst_v,
  3143. int width) {
  3144. asm volatile(
  3145. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  3146. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  3147. "sub %1,%2 \n"
  3148. LABELALIGN
  3149. "1: \n"
  3150. "vmovdqu (%0),%%ymm0 \n"
  3151. "vmovdqu 0x20(%0),%%ymm1 \n"
  3152. "lea 0x40(%0),%0 \n"
  3153. "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
  3154. "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
  3155. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  3156. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  3157. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  3158. "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
  3159. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3160. "vpermq $0xd8,%%ymm2,%%ymm2 \n"
  3161. "vmovdqu %%ymm0,(%1) \n"
  3162. "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
  3163. "lea 0x20(%1),%1 \n"
  3164. "sub $0x20,%3 \n"
  3165. "jg 1b \n"
  3166. "vzeroupper \n"
  3167. : "+r"(src_uv), // %0
  3168. "+r"(dst_u), // %1
  3169. "+r"(dst_v), // %2
  3170. "+r"(width) // %3
  3171. :
  3172. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  3173. }
  3174. #endif // HAS_SPLITUVROW_AVX2
  3175. #ifdef HAS_SPLITUVROW_SSE2
  3176. void SplitUVRow_SSE2(const uint8_t* src_uv,
  3177. uint8_t* dst_u,
  3178. uint8_t* dst_v,
  3179. int width) {
  3180. asm volatile(
  3181. "pcmpeqb %%xmm5,%%xmm5 \n"
  3182. "psrlw $0x8,%%xmm5 \n"
  3183. "sub %1,%2 \n"
  3184. LABELALIGN
  3185. "1: \n"
  3186. "movdqu (%0),%%xmm0 \n"
  3187. "movdqu 0x10(%0),%%xmm1 \n"
  3188. "lea 0x20(%0),%0 \n"
  3189. "movdqa %%xmm0,%%xmm2 \n"
  3190. "movdqa %%xmm1,%%xmm3 \n"
  3191. "pand %%xmm5,%%xmm0 \n"
  3192. "pand %%xmm5,%%xmm1 \n"
  3193. "packuswb %%xmm1,%%xmm0 \n"
  3194. "psrlw $0x8,%%xmm2 \n"
  3195. "psrlw $0x8,%%xmm3 \n"
  3196. "packuswb %%xmm3,%%xmm2 \n"
  3197. "movdqu %%xmm0,(%1) \n"
  3198. "movdqu %%xmm2,0x00(%1,%2,1) \n"
  3199. "lea 0x10(%1),%1 \n"
  3200. "sub $0x10,%3 \n"
  3201. "jg 1b \n"
  3202. : "+r"(src_uv), // %0
  3203. "+r"(dst_u), // %1
  3204. "+r"(dst_v), // %2
  3205. "+r"(width) // %3
  3206. :
  3207. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  3208. }
  3209. #endif // HAS_SPLITUVROW_SSE2
  3210. #ifdef HAS_MERGEUVROW_AVX2
  3211. void MergeUVRow_AVX2(const uint8_t* src_u,
  3212. const uint8_t* src_v,
  3213. uint8_t* dst_uv,
  3214. int width) {
  3215. asm volatile(
  3216. "sub %0,%1 \n"
  3217. LABELALIGN
  3218. "1: \n"
  3219. "vmovdqu (%0),%%ymm0 \n"
  3220. "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
  3221. "lea 0x20(%0),%0 \n"
  3222. "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
  3223. "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
  3224. "vextractf128 $0x0,%%ymm2,(%2) \n"
  3225. "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
  3226. "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
  3227. "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
  3228. "lea 0x40(%2),%2 \n"
  3229. "sub $0x20,%3 \n"
  3230. "jg 1b \n"
  3231. "vzeroupper \n"
  3232. : "+r"(src_u), // %0
  3233. "+r"(src_v), // %1
  3234. "+r"(dst_uv), // %2
  3235. "+r"(width) // %3
  3236. :
  3237. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3238. }
  3239. #endif // HAS_MERGEUVROW_AVX2
  3240. #ifdef HAS_MERGEUVROW_SSE2
  3241. void MergeUVRow_SSE2(const uint8_t* src_u,
  3242. const uint8_t* src_v,
  3243. uint8_t* dst_uv,
  3244. int width) {
  3245. asm volatile(
  3246. "sub %0,%1 \n"
  3247. LABELALIGN
  3248. "1: \n"
  3249. "movdqu (%0),%%xmm0 \n"
  3250. "movdqu 0x00(%0,%1,1),%%xmm1 \n"
  3251. "lea 0x10(%0),%0 \n"
  3252. "movdqa %%xmm0,%%xmm2 \n"
  3253. "punpcklbw %%xmm1,%%xmm0 \n"
  3254. "punpckhbw %%xmm1,%%xmm2 \n"
  3255. "movdqu %%xmm0,(%2) \n"
  3256. "movdqu %%xmm2,0x10(%2) \n"
  3257. "lea 0x20(%2),%2 \n"
  3258. "sub $0x10,%3 \n"
  3259. "jg 1b \n"
  3260. : "+r"(src_u), // %0
  3261. "+r"(src_v), // %1
  3262. "+r"(dst_uv), // %2
  3263. "+r"(width) // %3
  3264. :
  3265. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3266. }
  3267. #endif // HAS_MERGEUVROW_SSE2
  3268. // Use scale to convert lsb formats to msb, depending how many bits there are:
  3269. // 128 = 9 bits
  3270. // 64 = 10 bits
  3271. // 16 = 12 bits
  3272. // 1 = 16 bits
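// Illustrative scalar sketch (not part of the library; helper name is
// hypothetical) of what the scale factor does per element: vpmullw keeps the
// low 16 bits of the product, so scale = 1 << (16 - bits) shifts the
// significant bits of each sample up to the top of the 16-bit lane
// (e.g. 10-bit input with scale 64 becomes src << 6).
static inline void MergeUV16Pair_Sketch(uint16_t u, uint16_t v, int scale,
                                        uint16_t* dst_uv /* 2 elements */) {
  dst_uv[0] = (uint16_t)(u * scale);
  dst_uv[1] = (uint16_t)(v * scale);
}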
  3273. #ifdef HAS_MERGEUVROW_16_AVX2
  3274. void MergeUVRow_16_AVX2(const uint16_t* src_u,
  3275. const uint16_t* src_v,
  3276. uint16_t* dst_uv,
  3277. int scale,
  3278. int width) {
  3279. // clang-format off
  3280. asm volatile (
  3281. "vmovd %4,%%xmm3 \n"
  3282. "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
  3283. "vbroadcastss %%xmm3,%%ymm3 \n"
  3284. "sub %0,%1 \n"
  3285. // 16 pixels per loop.
  3286. LABELALIGN
  3287. "1: \n"
  3288. "vmovdqu (%0),%%ymm0 \n"
  3289. "vmovdqu (%0,%1,1),%%ymm1 \n"
  3290. "add $0x20,%0 \n"
  3291. "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
  3292. "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
  3293. "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
  3294. "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
  3295. "vextractf128 $0x0,%%ymm2,(%2) \n"
  3296. "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
  3297. "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
  3298. "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
  3299. "add $0x40,%2 \n"
  3300. "sub $0x10,%3 \n"
  3301. "jg 1b \n"
  3302. "vzeroupper \n"
  3303. : "+r"(src_u), // %0
  3304. "+r"(src_v), // %1
  3305. "+r"(dst_uv), // %2
  3306. "+r"(width) // %3
  3307. : "r"(scale) // %4
  3308. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
  3309. // clang-format on
  3310. }
  3311. #endif // HAS_MERGEUVROW_16_AVX2
  3312. // Use scale to convert lsb formats to msb, depending how many bits there are:
  3313. // 128 = 9 bits
  3314. // 64 = 10 bits
  3315. // 16 = 12 bits
  3316. // 1 = 16 bits
  3317. #ifdef HAS_MULTIPLYROW_16_AVX2
  3318. void MultiplyRow_16_AVX2(const uint16_t* src_y,
  3319. uint16_t* dst_y,
  3320. int scale,
  3321. int width) {
  3322. // clang-format off
  3323. asm volatile (
  3324. "vmovd %3,%%xmm3 \n"
  3325. "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
  3326. "vbroadcastss %%xmm3,%%ymm3 \n"
  3327. "sub %0,%1 \n"
  3328. // 16 pixels per loop.
  3329. LABELALIGN
  3330. "1: \n"
  3331. "vmovdqu (%0),%%ymm0 \n"
  3332. "vmovdqu 0x20(%0),%%ymm1 \n"
  3333. "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
  3334. "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
  3335. "vmovdqu %%ymm0,(%0,%1) \n"
  3336. "vmovdqu %%ymm1,0x20(%0,%1) \n"
  3337. "add $0x40,%0 \n"
  3338. "sub $0x20,%2 \n"
  3339. "jg 1b \n"
  3340. "vzeroupper \n"
  3341. : "+r"(src_y), // %0
  3342. "+r"(dst_y), // %1
  3343. "+r"(width) // %2
  3344. : "r"(scale) // %3
  3345. : "memory", "cc", "xmm0", "xmm1", "xmm3");
  3346. // clang-format on
  3347. }
  3348. #endif // HAS_MULTIPLYROW_16_AVX2
  3349. // Use scale to convert lsb formats to msb, depending how many bits there are:
  3350. // 32768 = 9 bits
  3351. // 16384 = 10 bits
  3352. // 4096 = 12 bits
  3353. // 256 = 16 bits
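// Illustrative scalar sketch (not part of the library; helper name is
// hypothetical) of the scaling above: pmulhuw keeps the high 16 bits of the
// product, so scale = 1 << (24 - bits) reduces a sample of the given depth to
// 8 bits, e.g. a 10-bit sample with scale 16384 becomes src >> 2.
static inline uint8_t Convert16To8Sample_Sketch(uint16_t src, int scale) {
  uint32_t v = ((uint32_t)src * (uint32_t)scale) >> 16;  // pmulhuw
  return (uint8_t)(v > 255 ? 255 : v);                   // packuswb saturates
}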
  3354. void Convert16To8Row_SSSE3(const uint16_t* src_y,
  3355. uint8_t* dst_y,
  3356. int scale,
  3357. int width) {
  3358. // clang-format off
  3359. asm volatile (
  3360. "movd %3,%%xmm2 \n"
  3361. "punpcklwd %%xmm2,%%xmm2 \n"
  3362. "pshufd $0x0,%%xmm2,%%xmm2 \n"
  3363. // 32 pixels per loop.
  3364. LABELALIGN
  3365. "1: \n"
  3366. "movdqu (%0),%%xmm0 \n"
  3367. "movdqu 0x10(%0),%%xmm1 \n"
  3368. "add $0x20,%0 \n"
  3369. "pmulhuw %%xmm2,%%xmm0 \n"
  3370. "pmulhuw %%xmm2,%%xmm1 \n"
  3371. "packuswb %%xmm1,%%xmm0 \n"
  3372. "movdqu %%xmm0,(%1) \n"
  3373. "add $0x10,%1 \n"
  3374. "sub $0x10,%2 \n"
  3375. "jg 1b \n"
  3376. : "+r"(src_y), // %0
  3377. "+r"(dst_y), // %1
  3378. "+r"(width) // %2
  3379. : "r"(scale) // %3
  3380. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3381. // clang-format on
  3382. }
  3383. #ifdef HAS_CONVERT16TO8ROW_AVX2
  3384. void Convert16To8Row_AVX2(const uint16_t* src_y,
  3385. uint8_t* dst_y,
  3386. int scale,
  3387. int width) {
  3388. // clang-format off
  3389. asm volatile (
  3390. "vmovd %3,%%xmm2 \n"
  3391. "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
  3392. "vbroadcastss %%xmm2,%%ymm2 \n"
  3393. // 32 pixels per loop.
  3394. LABELALIGN
  3395. "1: \n"
  3396. "vmovdqu (%0),%%ymm0 \n"
  3397. "vmovdqu 0x20(%0),%%ymm1 \n"
  3398. "add $0x40,%0 \n"
  3399. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  3400. "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
  3401. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
  3402. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3403. "vmovdqu %%ymm0,(%1) \n"
  3404. "add $0x20,%1 \n"
  3405. "sub $0x20,%2 \n"
  3406. "jg 1b \n"
  3407. "vzeroupper \n"
  3408. : "+r"(src_y), // %0
  3409. "+r"(dst_y), // %1
  3410. "+r"(width) // %2
  3411. : "r"(scale) // %3
  3412. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3413. // clang-format on
  3414. }
  3415. #endif // HAS_CONVERT16TO8ROW_AVX2
  3416. // Use scale to convert to lsb formats depending how many bits there are:
  3417. // 512 = 9 bits
  3418. // 1024 = 10 bits
  3419. // 4096 = 12 bits
  3420. // TODO(fbarchard): reduce to SSE2
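// Illustrative scalar sketch (not part of the library; helper name is
// hypothetical) of the widening above: punpcklbw duplicates each byte into
// both halves of a word (src * 257) and pmulhuw keeps the high 16 bits, so
// scale = 1 << bits maps 0..255 onto 0..(2^bits - 1); e.g. for 10 bits,
// 255 * 257 * 1024 >> 16 == 1023.
static inline uint16_t Convert8To16Sample_Sketch(uint8_t src, int scale) {
  return (uint16_t)(((uint32_t)src * 257u * (uint32_t)scale) >> 16);
}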
  3421. void Convert8To16Row_SSE2(const uint8_t* src_y,
  3422. uint16_t* dst_y,
  3423. int scale,
  3424. int width) {
  3425. // clang-format off
  3426. asm volatile (
  3427. "movd %3,%%xmm2 \n"
  3428. "punpcklwd %%xmm2,%%xmm2 \n"
  3429. "pshufd $0x0,%%xmm2,%%xmm2 \n"
  3430. // 32 pixels per loop.
  3431. LABELALIGN
  3432. "1: \n"
  3433. "movdqu (%0),%%xmm0 \n"
  3434. "movdqa %%xmm0,%%xmm1 \n"
  3435. "punpcklbw %%xmm0,%%xmm0 \n"
  3436. "punpckhbw %%xmm1,%%xmm1 \n"
  3437. "add $0x10,%0 \n"
  3438. "pmulhuw %%xmm2,%%xmm0 \n"
  3439. "pmulhuw %%xmm2,%%xmm1 \n"
  3440. "movdqu %%xmm0,(%1) \n"
  3441. "movdqu %%xmm1,0x10(%1) \n"
  3442. "add $0x20,%1 \n"
  3443. "sub $0x10,%2 \n"
  3444. "jg 1b \n"
  3445. : "+r"(src_y), // %0
  3446. "+r"(dst_y), // %1
  3447. "+r"(width) // %2
  3448. : "r"(scale) // %3
  3449. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3450. // clang-format on
  3451. }
  3452. #ifdef HAS_CONVERT8TO16ROW_AVX2
  3453. void Convert8To16Row_AVX2(const uint8_t* src_y,
  3454. uint16_t* dst_y,
  3455. int scale,
  3456. int width) {
  3457. // clang-format off
  3458. asm volatile (
  3459. "vmovd %3,%%xmm2 \n"
  3460. "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
  3461. "vbroadcastss %%xmm2,%%ymm2 \n"
  3462. // 32 pixels per loop.
  3463. LABELALIGN
  3464. "1: \n"
  3465. "vmovdqu (%0),%%ymm0 \n"
  3466. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  3467. "add $0x20,%0 \n"
  3468. "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
  3469. "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
  3470. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  3471. "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
  3472. "vmovdqu %%ymm0,(%1) \n"
  3473. "vmovdqu %%ymm1,0x20(%1) \n"
  3474. "add $0x40,%1 \n"
  3475. "sub $0x20,%2 \n"
  3476. "jg 1b \n"
  3477. "vzeroupper \n"
  3478. : "+r"(src_y), // %0
  3479. "+r"(dst_y), // %1
  3480. "+r"(width) // %2
  3481. : "r"(scale) // %3
  3482. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3483. // clang-format on
  3484. }
  3485. #endif // HAS_CONVERT8TO16ROW_AVX2
  3486. #ifdef HAS_SPLITRGBROW_SSSE3
  3487. // Shuffle table for converting RGB to Planar.
  3488. static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
  3489. 128u, 128u, 128u, 128u, 128u, 128u,
  3490. 128u, 128u, 128u, 128u};
  3491. static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
  3492. 2u, 5u, 8u, 11u, 14u, 128u,
  3493. 128u, 128u, 128u, 128u};
  3494. static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
  3495. 128u, 128u, 128u, 128u, 128u, 1u,
  3496. 4u, 7u, 10u, 13u};
  3497. static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
  3498. 128u, 128u, 128u, 128u, 128u, 128u,
  3499. 128u, 128u, 128u, 128u};
  3500. static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
  3501. 3u, 6u, 9u, 12u, 15u, 128u,
  3502. 128u, 128u, 128u, 128u};
  3503. static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
  3504. 128u, 128u, 128u, 128u, 128u, 2u,
  3505. 5u, 8u, 11u, 14u};
  3506. static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
  3507. 128u, 128u, 128u, 128u, 128u, 128u,
  3508. 128u, 128u, 128u, 128u};
  3509. static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
  3510. 4u, 7u, 10u, 13u, 128u, 128u,
  3511. 128u, 128u, 128u, 128u};
  3512. static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
  3513. 128u, 128u, 128u, 128u, 0u, 3u,
  3514. 6u, 9u, 12u, 15u};
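// Illustrative scalar sketch (not part of the library; helper name is
// hypothetical) of what the three pshufb/por passes below compute: a shuffle
// index of 128 yields a zero byte, so OR-ing the three shuffled 16-byte
// chunks concatenates one channel from the 48 bytes of packed RGB read per
// iteration.
static inline void SplitRGBPixel_Sketch(const uint8_t* src_rgb, int i,
                                        uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b) {
  dst_r[i] = src_rgb[3 * i + 0];
  dst_g[i] = src_rgb[3 * i + 1];
  dst_b[i] = src_rgb[3 * i + 2];
}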
  3515. void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
  3516. uint8_t* dst_r,
  3517. uint8_t* dst_g,
  3518. uint8_t* dst_b,
  3519. int width) {
  3520. asm volatile(
  3521. LABELALIGN
  3522. "1: \n"
  3523. "movdqu (%0),%%xmm0 \n"
  3524. "movdqu 0x10(%0),%%xmm1 \n"
  3525. "movdqu 0x20(%0),%%xmm2 \n"
  3526. "pshufb %5, %%xmm0 \n"
  3527. "pshufb %6, %%xmm1 \n"
  3528. "pshufb %7, %%xmm2 \n"
  3529. "por %%xmm1,%%xmm0 \n"
  3530. "por %%xmm2,%%xmm0 \n"
  3531. "movdqu %%xmm0,(%1) \n"
  3532. "lea 0x10(%1),%1 \n"
  3533. "movdqu (%0),%%xmm0 \n"
  3534. "movdqu 0x10(%0),%%xmm1 \n"
  3535. "movdqu 0x20(%0),%%xmm2 \n"
  3536. "pshufb %8, %%xmm0 \n"
  3537. "pshufb %9, %%xmm1 \n"
  3538. "pshufb %10, %%xmm2 \n"
  3539. "por %%xmm1,%%xmm0 \n"
  3540. "por %%xmm2,%%xmm0 \n"
  3541. "movdqu %%xmm0,(%2) \n"
  3542. "lea 0x10(%2),%2 \n"
  3543. "movdqu (%0),%%xmm0 \n"
  3544. "movdqu 0x10(%0),%%xmm1 \n"
  3545. "movdqu 0x20(%0),%%xmm2 \n"
  3546. "pshufb %11, %%xmm0 \n"
  3547. "pshufb %12, %%xmm1 \n"
  3548. "pshufb %13, %%xmm2 \n"
  3549. "por %%xmm1,%%xmm0 \n"
  3550. "por %%xmm2,%%xmm0 \n"
  3551. "movdqu %%xmm0,(%3) \n"
  3552. "lea 0x10(%3),%3 \n"
  3553. "lea 0x30(%0),%0 \n"
  3554. "sub $0x10,%4 \n"
  3555. "jg 1b \n"
  3556. : "+r"(src_rgb), // %0
  3557. "+r"(dst_r), // %1
  3558. "+r"(dst_g), // %2
  3559. "+r"(dst_b), // %3
  3560. "+r"(width) // %4
  3561. : "m"(kShuffleMaskRGBToR0), // %5
  3562. "m"(kShuffleMaskRGBToR1), // %6
  3563. "m"(kShuffleMaskRGBToR2), // %7
  3564. "m"(kShuffleMaskRGBToG0), // %8
  3565. "m"(kShuffleMaskRGBToG1), // %9
  3566. "m"(kShuffleMaskRGBToG2), // %10
  3567. "m"(kShuffleMaskRGBToB0), // %11
  3568. "m"(kShuffleMaskRGBToB1), // %12
  3569. "m"(kShuffleMaskRGBToB2) // %13
  3570. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3571. }
  3572. #endif // HAS_SPLITRGBROW_SSSE3
  3573. #ifdef HAS_MERGERGBROW_SSSE3
  3574. // Shuffle table for converting Planar to RGB.
  3575. static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
  3576. 2u, 128u, 128u, 3u, 128u, 128u,
  3577. 4u, 128u, 128u, 5u};
  3578. static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
  3579. 128u, 2u, 128u, 128u, 3u, 128u,
  3580. 128u, 4u, 128u, 128u};
  3581. static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
  3582. 128u, 128u, 2u, 128u, 128u, 3u,
  3583. 128u, 128u, 4u, 128u};
  3584. static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
  3585. 7u, 128u, 128u, 8u, 128u, 128u,
  3586. 9u, 128u, 128u, 10u};
  3587. static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
  3588. 128u, 7u, 128u, 128u, 8u, 128u,
  3589. 128u, 9u, 128u, 128u};
  3590. static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
  3591. 128u, 128u, 8u, 128u, 128u, 9u,
  3592. 128u, 128u, 10u, 128u};
  3593. static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
  3594. 12u, 128u, 128u, 13u, 128u, 128u,
  3595. 14u, 128u, 128u, 15u};
  3596. static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
  3597. 128u, 13u, 128u, 128u, 14u, 128u,
  3598. 128u, 15u, 128u, 128u};
  3599. static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
  3600. 128u, 128u, 13u, 128u, 128u, 14u,
  3601. 128u, 128u, 15u, 128u};
  3602. void MergeRGBRow_SSSE3(const uint8_t* src_r,
  3603. const uint8_t* src_g,
  3604. const uint8_t* src_b,
  3605. uint8_t* dst_rgb,
  3606. int width) {
  3607. asm volatile(
  3608. LABELALIGN
  3609. "1: \n"
  3610. "movdqu (%0),%%xmm0 \n"
  3611. "movdqu (%1),%%xmm1 \n"
  3612. "movdqu (%2),%%xmm2 \n"
  3613. "pshufb %5, %%xmm0 \n"
  3614. "pshufb %6, %%xmm1 \n"
  3615. "pshufb %7, %%xmm2 \n"
  3616. "por %%xmm1,%%xmm0 \n"
  3617. "por %%xmm2,%%xmm0 \n"
  3618. "movdqu %%xmm0,(%3) \n"
  3619. "movdqu (%0),%%xmm0 \n"
  3620. "movdqu (%1),%%xmm1 \n"
  3621. "movdqu (%2),%%xmm2 \n"
  3622. "pshufb %8, %%xmm0 \n"
  3623. "pshufb %9, %%xmm1 \n"
  3624. "pshufb %10, %%xmm2 \n"
  3625. "por %%xmm1,%%xmm0 \n"
  3626. "por %%xmm2,%%xmm0 \n"
  3627. "movdqu %%xmm0,16(%3) \n"
  3628. "movdqu (%0),%%xmm0 \n"
  3629. "movdqu (%1),%%xmm1 \n"
  3630. "movdqu (%2),%%xmm2 \n"
  3631. "pshufb %11, %%xmm0 \n"
  3632. "pshufb %12, %%xmm1 \n"
  3633. "pshufb %13, %%xmm2 \n"
  3634. "por %%xmm1,%%xmm0 \n"
  3635. "por %%xmm2,%%xmm0 \n"
  3636. "movdqu %%xmm0,32(%3) \n"
  3637. "lea 0x10(%0),%0 \n"
  3638. "lea 0x10(%1),%1 \n"
  3639. "lea 0x10(%2),%2 \n"
  3640. "lea 0x30(%3),%3 \n"
  3641. "sub $0x10,%4 \n"
  3642. "jg 1b \n"
  3643. : "+r"(src_r), // %0
  3644. "+r"(src_g), // %1
  3645. "+r"(src_b), // %2
  3646. "+r"(dst_rgb), // %3
  3647. "+r"(width) // %4
  3648. : "m"(kShuffleMaskRToRGB0), // %5
  3649. "m"(kShuffleMaskGToRGB0), // %6
  3650. "m"(kShuffleMaskBToRGB0), // %7
  3651. "m"(kShuffleMaskRToRGB1), // %8
  3652. "m"(kShuffleMaskGToRGB1), // %9
  3653. "m"(kShuffleMaskBToRGB1), // %10
  3654. "m"(kShuffleMaskRToRGB2), // %11
  3655. "m"(kShuffleMaskGToRGB2), // %12
  3656. "m"(kShuffleMaskBToRGB2) // %13
  3657. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3658. }
  3659. #endif // HAS_MERGERGBROW_SSSE3
  3660. #ifdef HAS_COPYROW_SSE2
  3661. void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  3662. asm volatile(
  3663. "test $0xf,%0 \n"
  3664. "jne 2f \n"
  3665. "test $0xf,%1 \n"
  3666. "jne 2f \n"
  3667. LABELALIGN
  3668. "1: \n"
  3669. "movdqa (%0),%%xmm0 \n"
  3670. "movdqa 0x10(%0),%%xmm1 \n"
  3671. "lea 0x20(%0),%0 \n"
  3672. "movdqa %%xmm0,(%1) \n"
  3673. "movdqa %%xmm1,0x10(%1) \n"
  3674. "lea 0x20(%1),%1 \n"
  3675. "sub $0x20,%2 \n"
  3676. "jg 1b \n"
  3677. "jmp 9f \n"
  3678. LABELALIGN
  3679. "2: \n"
  3680. "movdqu (%0),%%xmm0 \n"
  3681. "movdqu 0x10(%0),%%xmm1 \n"
  3682. "lea 0x20(%0),%0 \n"
  3683. "movdqu %%xmm0,(%1) \n"
  3684. "movdqu %%xmm1,0x10(%1) \n"
  3685. "lea 0x20(%1),%1 \n"
  3686. "sub $0x20,%2 \n"
  3687. "jg 2b \n"
  3688. LABELALIGN "9: \n"
  3689. : "+r"(src), // %0
  3690. "+r"(dst), // %1
  3691. "+r"(width) // %2
  3692. :
  3693. : "memory", "cc", "xmm0", "xmm1");
  3694. }
  3695. #endif // HAS_COPYROW_SSE2
  3696. #ifdef HAS_COPYROW_AVX
  3697. void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
  3698. asm volatile(
  3699. LABELALIGN
  3700. "1: \n"
  3701. "vmovdqu (%0),%%ymm0 \n"
  3702. "vmovdqu 0x20(%0),%%ymm1 \n"
  3703. "lea 0x40(%0),%0 \n"
  3704. "vmovdqu %%ymm0,(%1) \n"
  3705. "vmovdqu %%ymm1,0x20(%1) \n"
  3706. "lea 0x40(%1),%1 \n"
  3707. "sub $0x40,%2 \n"
  3708. "jg 1b \n"
  3709. : "+r"(src), // %0
  3710. "+r"(dst), // %1
  3711. "+r"(width) // %2
  3712. :
  3713. : "memory", "cc", "xmm0", "xmm1");
  3714. }
  3715. #endif // HAS_COPYROW_AVX
  3716. #ifdef HAS_COPYROW_ERMS
  3717. // Handles any width (multiple of 1 byte).
  3718. void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
  3719. size_t width_tmp = (size_t)(width);
  3720. asm volatile(
  3721. "rep movsb \n"
  3722. : "+S"(src), // %0
  3723. "+D"(dst), // %1
  3724. "+c"(width_tmp) // %2
  3725. :
  3726. : "memory", "cc");
  3727. }
  3728. #endif // HAS_COPYROW_ERMS
  3729. #ifdef HAS_ARGBCOPYALPHAROW_SSE2
  3730. // width in pixels
  3731. void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  3732. asm volatile(
  3733. "pcmpeqb %%xmm0,%%xmm0 \n"
  3734. "pslld $0x18,%%xmm0 \n"
  3735. "pcmpeqb %%xmm1,%%xmm1 \n"
  3736. "psrld $0x8,%%xmm1 \n"
  3737. LABELALIGN
  3738. "1: \n"
  3739. "movdqu (%0),%%xmm2 \n"
  3740. "movdqu 0x10(%0),%%xmm3 \n"
  3741. "lea 0x20(%0),%0 \n"
  3742. "movdqu (%1),%%xmm4 \n"
  3743. "movdqu 0x10(%1),%%xmm5 \n"
  3744. "pand %%xmm0,%%xmm2 \n"
  3745. "pand %%xmm0,%%xmm3 \n"
  3746. "pand %%xmm1,%%xmm4 \n"
  3747. "pand %%xmm1,%%xmm5 \n"
  3748. "por %%xmm4,%%xmm2 \n"
  3749. "por %%xmm5,%%xmm3 \n"
  3750. "movdqu %%xmm2,(%1) \n"
  3751. "movdqu %%xmm3,0x10(%1) \n"
  3752. "lea 0x20(%1),%1 \n"
  3753. "sub $0x8,%2 \n"
  3754. "jg 1b \n"
  3755. : "+r"(src), // %0
  3756. "+r"(dst), // %1
  3757. "+r"(width) // %2
  3758. :
  3759. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  3760. }
  3761. #endif // HAS_ARGBCOPYALPHAROW_SSE2
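// Illustrative scalar sketch (not part of the library; helper name is
// hypothetical) of the mask/blend the ARGBCopyAlphaRow functions perform:
// only the alpha byte of each destination pixel is replaced, the B/G/R bytes
// already in dst are kept.
static inline uint32_t CopyAlphaPixel_Sketch(uint32_t src_argb, uint32_t dst_argb) {
  return (src_argb & 0xff000000u) | (dst_argb & 0x00ffffffu);
}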
  3762. #ifdef HAS_ARGBCOPYALPHAROW_AVX2
  3763. // width in pixels
  3764. void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  3765. asm volatile(
  3766. "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
  3767. "vpsrld $0x8,%%ymm0,%%ymm0 \n"
  3768. LABELALIGN
  3769. "1: \n"
  3770. "vmovdqu (%0),%%ymm1 \n"
  3771. "vmovdqu 0x20(%0),%%ymm2 \n"
  3772. "lea 0x40(%0),%0 \n"
  3773. "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
  3774. "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
  3775. "vmovdqu %%ymm1,(%1) \n"
  3776. "vmovdqu %%ymm2,0x20(%1) \n"
  3777. "lea 0x40(%1),%1 \n"
  3778. "sub $0x10,%2 \n"
  3779. "jg 1b \n"
  3780. "vzeroupper \n"
  3781. : "+r"(src), // %0
  3782. "+r"(dst), // %1
  3783. "+r"(width) // %2
  3784. :
  3785. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3786. }
  3787. #endif // HAS_ARGBCOPYALPHAROW_AVX2
  3788. #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
  3789. // width in pixels
  3790. void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
  3791. uint8_t* dst_a,
  3792. int width) {
  3793. asm volatile(
  3794. LABELALIGN
  3795. "1: \n"
  3796. "movdqu (%0), %%xmm0 \n"
  3797. "movdqu 0x10(%0), %%xmm1 \n"
  3798. "lea 0x20(%0), %0 \n"
  3799. "psrld $0x18, %%xmm0 \n"
  3800. "psrld $0x18, %%xmm1 \n"
  3801. "packssdw %%xmm1, %%xmm0 \n"
  3802. "packuswb %%xmm0, %%xmm0 \n"
  3803. "movq %%xmm0,(%1) \n"
  3804. "lea 0x8(%1), %1 \n"
  3805. "sub $0x8, %2 \n"
  3806. "jg 1b \n"
  3807. : "+r"(src_argb), // %0
  3808. "+r"(dst_a), // %1
  3809. "+rm"(width) // %2
  3810. :
  3811. : "memory", "cc", "xmm0", "xmm1");
  3812. }
  3813. #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
  3814. #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
  3815. static const uvec8 kShuffleAlphaShort_AVX2 = {
  3816. 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
  3817. 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
  3818. void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
  3819. uint8_t* dst_a,
  3820. int width) {
  3821. asm volatile(
  3822. "vmovdqa %3,%%ymm4 \n"
  3823. "vbroadcastf128 %4,%%ymm5 \n"
  3824. LABELALIGN
  3825. "1: \n"
  3826. "vmovdqu (%0), %%ymm0 \n"
  3827. "vmovdqu 0x20(%0), %%ymm1 \n"
  3828. "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
  3829. "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
  3830. "vmovdqu 0x40(%0), %%ymm2 \n"
  3831. "vmovdqu 0x60(%0), %%ymm3 \n"
  3832. "lea 0x80(%0), %0 \n"
  3833. "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
  3834. "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
  3835. "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
  3836. "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
  3837. "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
  3838. "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
  3839. "vmovdqu %%ymm0,(%1) \n"
  3840. "lea 0x20(%1),%1 \n"
  3841. "sub $0x20, %2 \n"
  3842. "jg 1b \n"
  3843. "vzeroupper \n"
  3844. : "+r"(src_argb), // %0
  3845. "+r"(dst_a), // %1
  3846. "+rm"(width) // %2
  3847. : "m"(kPermdARGBToY_AVX), // %3
  3848. "m"(kShuffleAlphaShort_AVX2) // %4
  3849. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  3850. }
  3851. #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
  3852. #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
  3853. // width in pixels
  3854. void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  3855. asm volatile(
  3856. "pcmpeqb %%xmm0,%%xmm0 \n"
  3857. "pslld $0x18,%%xmm0 \n"
  3858. "pcmpeqb %%xmm1,%%xmm1 \n"
  3859. "psrld $0x8,%%xmm1 \n"
  3860. LABELALIGN
  3861. "1: \n"
  3862. "movq (%0),%%xmm2 \n"
  3863. "lea 0x8(%0),%0 \n"
  3864. "punpcklbw %%xmm2,%%xmm2 \n"
  3865. "punpckhwd %%xmm2,%%xmm3 \n"
  3866. "punpcklwd %%xmm2,%%xmm2 \n"
  3867. "movdqu (%1),%%xmm4 \n"
  3868. "movdqu 0x10(%1),%%xmm5 \n"
  3869. "pand %%xmm0,%%xmm2 \n"
  3870. "pand %%xmm0,%%xmm3 \n"
  3871. "pand %%xmm1,%%xmm4 \n"
  3872. "pand %%xmm1,%%xmm5 \n"
  3873. "por %%xmm4,%%xmm2 \n"
  3874. "por %%xmm5,%%xmm3 \n"
  3875. "movdqu %%xmm2,(%1) \n"
  3876. "movdqu %%xmm3,0x10(%1) \n"
  3877. "lea 0x20(%1),%1 \n"
  3878. "sub $0x8,%2 \n"
  3879. "jg 1b \n"
  3880. : "+r"(src), // %0
  3881. "+r"(dst), // %1
  3882. "+r"(width) // %2
  3883. :
  3884. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  3885. }
  3886. #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
  3887. #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
  3888. // width in pixels
  3889. void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  3890. asm volatile(
  3891. "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
  3892. "vpsrld $0x8,%%ymm0,%%ymm0 \n"
  3893. LABELALIGN
  3894. "1: \n"
  3895. "vpmovzxbd (%0),%%ymm1 \n"
  3896. "vpmovzxbd 0x8(%0),%%ymm2 \n"
  3897. "lea 0x10(%0),%0 \n"
  3898. "vpslld $0x18,%%ymm1,%%ymm1 \n"
  3899. "vpslld $0x18,%%ymm2,%%ymm2 \n"
  3900. "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
  3901. "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
  3902. "vmovdqu %%ymm1,(%1) \n"
  3903. "vmovdqu %%ymm2,0x20(%1) \n"
  3904. "lea 0x40(%1),%1 \n"
  3905. "sub $0x10,%2 \n"
  3906. "jg 1b \n"
  3907. "vzeroupper \n"
  3908. : "+r"(src), // %0
  3909. "+r"(dst), // %1
  3910. "+r"(width) // %2
  3911. :
  3912. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  3913. }
  3914. #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
  3915. #ifdef HAS_SETROW_X86
  3916. void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
  3917. size_t width_tmp = (size_t)(width >> 2);
  3918. const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
  3919. asm volatile(
  3920. "rep stosl \n"
  3921. : "+D"(dst), // %0
  3922. "+c"(width_tmp) // %1
  3923. : "a"(v32) // %2
  3924. : "memory", "cc");
  3925. }
  3926. void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
  3927. size_t width_tmp = (size_t)(width);
  3928. asm volatile(
  3929. "rep stosb \n"
  3930. : "+D"(dst), // %0
  3931. "+c"(width_tmp) // %1
  3932. : "a"(v8) // %2
  3933. : "memory", "cc");
  3934. }
  3935. void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
  3936. size_t width_tmp = (size_t)(width);
  3937. asm volatile(
  3938. "rep stosl \n"
  3939. : "+D"(dst_argb), // %0
  3940. "+c"(width_tmp) // %1
  3941. : "a"(v32) // %2
  3942. : "memory", "cc");
  3943. }
  3944. #endif // HAS_SETROW_X86
  3945. #ifdef HAS_YUY2TOYROW_SSE2
  3946. void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  3947. asm volatile(
  3948. "pcmpeqb %%xmm5,%%xmm5 \n"
  3949. "psrlw $0x8,%%xmm5 \n"
  3950. LABELALIGN
  3951. "1: \n"
  3952. "movdqu (%0),%%xmm0 \n"
  3953. "movdqu 0x10(%0),%%xmm1 \n"
  3954. "lea 0x20(%0),%0 \n"
  3955. "pand %%xmm5,%%xmm0 \n"
  3956. "pand %%xmm5,%%xmm1 \n"
  3957. "packuswb %%xmm1,%%xmm0 \n"
  3958. "movdqu %%xmm0,(%1) \n"
  3959. "lea 0x10(%1),%1 \n"
  3960. "sub $0x10,%2 \n"
  3961. "jg 1b \n"
  3962. : "+r"(src_yuy2), // %0
  3963. "+r"(dst_y), // %1
  3964. "+r"(width) // %2
  3965. :
  3966. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  3967. }
  3968. void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
  3969. int stride_yuy2,
  3970. uint8_t* dst_u,
  3971. uint8_t* dst_v,
  3972. int width) {
  3973. asm volatile(
  3974. "pcmpeqb %%xmm5,%%xmm5 \n"
  3975. "psrlw $0x8,%%xmm5 \n"
  3976. "sub %1,%2 \n"
  3977. LABELALIGN
  3978. "1: \n"
  3979. "movdqu (%0),%%xmm0 \n"
  3980. "movdqu 0x10(%0),%%xmm1 \n"
  3981. "movdqu 0x00(%0,%4,1),%%xmm2 \n"
  3982. "movdqu 0x10(%0,%4,1),%%xmm3 \n"
  3983. "lea 0x20(%0),%0 \n"
  3984. "pavgb %%xmm2,%%xmm0 \n"
  3985. "pavgb %%xmm3,%%xmm1 \n"
  3986. "psrlw $0x8,%%xmm0 \n"
  3987. "psrlw $0x8,%%xmm1 \n"
  3988. "packuswb %%xmm1,%%xmm0 \n"
  3989. "movdqa %%xmm0,%%xmm1 \n"
  3990. "pand %%xmm5,%%xmm0 \n"
  3991. "packuswb %%xmm0,%%xmm0 \n"
  3992. "psrlw $0x8,%%xmm1 \n"
  3993. "packuswb %%xmm1,%%xmm1 \n"
  3994. "movq %%xmm0,(%1) \n"
  3995. "movq %%xmm1,0x00(%1,%2,1) \n"
  3996. "lea 0x8(%1),%1 \n"
  3997. "sub $0x10,%3 \n"
  3998. "jg 1b \n"
  3999. : "+r"(src_yuy2), // %0
  4000. "+r"(dst_u), // %1
  4001. "+r"(dst_v), // %2
  4002. "+r"(width) // %3
  4003. : "r"((intptr_t)(stride_yuy2)) // %4
  4004. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  4005. }
  4006. void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
  4007. uint8_t* dst_u,
  4008. uint8_t* dst_v,
  4009. int width) {
  4010. asm volatile(
  4011. "pcmpeqb %%xmm5,%%xmm5 \n"
  4012. "psrlw $0x8,%%xmm5 \n"
  4013. "sub %1,%2 \n"
  4014. LABELALIGN
  4015. "1: \n"
  4016. "movdqu (%0),%%xmm0 \n"
  4017. "movdqu 0x10(%0),%%xmm1 \n"
  4018. "lea 0x20(%0),%0 \n"
  4019. "psrlw $0x8,%%xmm0 \n"
  4020. "psrlw $0x8,%%xmm1 \n"
  4021. "packuswb %%xmm1,%%xmm0 \n"
  4022. "movdqa %%xmm0,%%xmm1 \n"
  4023. "pand %%xmm5,%%xmm0 \n"
  4024. "packuswb %%xmm0,%%xmm0 \n"
  4025. "psrlw $0x8,%%xmm1 \n"
  4026. "packuswb %%xmm1,%%xmm1 \n"
  4027. "movq %%xmm0,(%1) \n"
  4028. "movq %%xmm1,0x00(%1,%2,1) \n"
  4029. "lea 0x8(%1),%1 \n"
  4030. "sub $0x10,%3 \n"
  4031. "jg 1b \n"
  4032. : "+r"(src_yuy2), // %0
  4033. "+r"(dst_u), // %1
  4034. "+r"(dst_v), // %2
  4035. "+r"(width) // %3
  4036. :
  4037. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4038. }
  4039. void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  4040. asm volatile(
  4041. LABELALIGN
  4042. "1: \n"
  4043. "movdqu (%0),%%xmm0 \n"
  4044. "movdqu 0x10(%0),%%xmm1 \n"
  4045. "lea 0x20(%0),%0 \n"
  4046. "psrlw $0x8,%%xmm0 \n"
  4047. "psrlw $0x8,%%xmm1 \n"
  4048. "packuswb %%xmm1,%%xmm0 \n"
  4049. "movdqu %%xmm0,(%1) \n"
  4050. "lea 0x10(%1),%1 \n"
  4051. "sub $0x10,%2 \n"
  4052. "jg 1b \n"
  4053. : "+r"(src_uyvy), // %0
  4054. "+r"(dst_y), // %1
  4055. "+r"(width) // %2
  4056. :
  4057. : "memory", "cc", "xmm0", "xmm1");
  4058. }
  4059. void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
  4060. int stride_uyvy,
  4061. uint8_t* dst_u,
  4062. uint8_t* dst_v,
  4063. int width) {
  4064. asm volatile(
  4065. "pcmpeqb %%xmm5,%%xmm5 \n"
  4066. "psrlw $0x8,%%xmm5 \n"
  4067. "sub %1,%2 \n"
  4068. LABELALIGN
  4069. "1: \n"
  4070. "movdqu (%0),%%xmm0 \n"
  4071. "movdqu 0x10(%0),%%xmm1 \n"
  4072. "movdqu 0x00(%0,%4,1),%%xmm2 \n"
  4073. "movdqu 0x10(%0,%4,1),%%xmm3 \n"
  4074. "lea 0x20(%0),%0 \n"
  4075. "pavgb %%xmm2,%%xmm0 \n"
  4076. "pavgb %%xmm3,%%xmm1 \n"
  4077. "pand %%xmm5,%%xmm0 \n"
  4078. "pand %%xmm5,%%xmm1 \n"
  4079. "packuswb %%xmm1,%%xmm0 \n"
  4080. "movdqa %%xmm0,%%xmm1 \n"
  4081. "pand %%xmm5,%%xmm0 \n"
  4082. "packuswb %%xmm0,%%xmm0 \n"
  4083. "psrlw $0x8,%%xmm1 \n"
  4084. "packuswb %%xmm1,%%xmm1 \n"
  4085. "movq %%xmm0,(%1) \n"
  4086. "movq %%xmm1,0x00(%1,%2,1) \n"
  4087. "lea 0x8(%1),%1 \n"
  4088. "sub $0x10,%3 \n"
  4089. "jg 1b \n"
  4090. : "+r"(src_uyvy), // %0
  4091. "+r"(dst_u), // %1
  4092. "+r"(dst_v), // %2
  4093. "+r"(width) // %3
  4094. : "r"((intptr_t)(stride_uyvy)) // %4
  4095. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  4096. }
  4097. void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
  4098. uint8_t* dst_u,
  4099. uint8_t* dst_v,
  4100. int width) {
  4101. asm volatile(
  4102. "pcmpeqb %%xmm5,%%xmm5 \n"
  4103. "psrlw $0x8,%%xmm5 \n"
  4104. "sub %1,%2 \n"
  4105. LABELALIGN
  4106. "1: \n"
  4107. "movdqu (%0),%%xmm0 \n"
  4108. "movdqu 0x10(%0),%%xmm1 \n"
  4109. "lea 0x20(%0),%0 \n"
  4110. "pand %%xmm5,%%xmm0 \n"
  4111. "pand %%xmm5,%%xmm1 \n"
  4112. "packuswb %%xmm1,%%xmm0 \n"
  4113. "movdqa %%xmm0,%%xmm1 \n"
  4114. "pand %%xmm5,%%xmm0 \n"
  4115. "packuswb %%xmm0,%%xmm0 \n"
  4116. "psrlw $0x8,%%xmm1 \n"
  4117. "packuswb %%xmm1,%%xmm1 \n"
  4118. "movq %%xmm0,(%1) \n"
  4119. "movq %%xmm1,0x00(%1,%2,1) \n"
  4120. "lea 0x8(%1),%1 \n"
  4121. "sub $0x10,%3 \n"
  4122. "jg 1b \n"
  4123. : "+r"(src_uyvy), // %0
  4124. "+r"(dst_u), // %1
  4125. "+r"(dst_v), // %2
  4126. "+r"(width) // %3
  4127. :
  4128. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4129. }
  4130. #endif // HAS_YUY2TOYROW_SSE2
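// Illustrative scalar sketch (not part of libyuv; libyuv's real C fallbacks
// live in row_common.cc). YUY2 packs pixels as Y0 U Y1 V, so Y is every even
// byte and each U/V pair is shared by two pixels; UYVY swaps the roles of the
// even and odd bytes. The *UVRow kernels above additionally average chroma
// with the row at src + stride before storing.
static inline void YUY2ToUV422Row_ScalarSketch(const uint8_t* src_yuy2,
                                               uint8_t* dst_u,
                                               uint8_t* dst_v,
                                               int width) {
  int x;
  for (x = 0; x < width; x += 2) {  // one U/V pair covers 2 pixels
    dst_u[0] = src_yuy2[1];  // U is byte 1 of each 4-byte group
    dst_v[0] = src_yuy2[3];  // V is byte 3
    src_yuy2 += 4;
    dst_u += 1;
    dst_v += 1;
  }
}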
  4131. #ifdef HAS_YUY2TOYROW_AVX2
  4132. void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  4133. asm volatile(
  4134. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4135. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  4136. LABELALIGN
  4137. "1: \n"
  4138. "vmovdqu (%0),%%ymm0 \n"
  4139. "vmovdqu 0x20(%0),%%ymm1 \n"
  4140. "lea 0x40(%0),%0 \n"
  4141. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  4142. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  4143. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4144. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4145. "vmovdqu %%ymm0,(%1) \n"
  4146. "lea 0x20(%1),%1 \n"
  4147. "sub $0x20,%2 \n"
  4148. "jg 1b \n"
  4149. "vzeroupper \n"
  4150. : "+r"(src_yuy2), // %0
  4151. "+r"(dst_y), // %1
  4152. "+r"(width) // %2
  4153. :
  4154. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4155. }
  4156. void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
  4157. int stride_yuy2,
  4158. uint8_t* dst_u,
  4159. uint8_t* dst_v,
  4160. int width) {
  4161. asm volatile(
  4162. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4163. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  4164. "sub %1,%2 \n"
  4165. LABELALIGN
  4166. "1: \n"
  4167. "vmovdqu (%0),%%ymm0 \n"
  4168. "vmovdqu 0x20(%0),%%ymm1 \n"
  4169. "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
  4170. "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
  4171. "lea 0x40(%0),%0 \n"
  4172. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4173. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  4174. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4175. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4176. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  4177. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4178. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  4179. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  4180. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  4181. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4182. "vextractf128 $0x0,%%ymm1,(%1) \n"
  4183. "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
  4184. "lea 0x10(%1),%1 \n"
  4185. "sub $0x20,%3 \n"
  4186. "jg 1b \n"
  4187. "vzeroupper \n"
  4188. : "+r"(src_yuy2), // %0
  4189. "+r"(dst_u), // %1
  4190. "+r"(dst_v), // %2
  4191. "+r"(width) // %3
  4192. : "r"((intptr_t)(stride_yuy2)) // %4
  4193. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4194. }
  4195. void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
  4196. uint8_t* dst_u,
  4197. uint8_t* dst_v,
  4198. int width) {
  4199. asm volatile(
  4200. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4201. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  4202. "sub %1,%2 \n"
  4203. LABELALIGN
  4204. "1: \n"
  4205. "vmovdqu (%0),%%ymm0 \n"
  4206. "vmovdqu 0x20(%0),%%ymm1 \n"
  4207. "lea 0x40(%0),%0 \n"
  4208. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4209. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  4210. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4211. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4212. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  4213. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4214. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  4215. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  4216. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  4217. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4218. "vextractf128 $0x0,%%ymm1,(%1) \n"
  4219. "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
  4220. "lea 0x10(%1),%1 \n"
  4221. "sub $0x20,%3 \n"
  4222. "jg 1b \n"
  4223. "vzeroupper \n"
  4224. : "+r"(src_yuy2), // %0
  4225. "+r"(dst_u), // %1
  4226. "+r"(dst_v), // %2
  4227. "+r"(width) // %3
  4228. :
  4229. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4230. }
  4231. void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  4232. asm volatile(
  4233. LABELALIGN
  4234. "1: \n"
  4235. "vmovdqu (%0),%%ymm0 \n"
  4236. "vmovdqu 0x20(%0),%%ymm1 \n"
  4237. "lea 0x40(%0),%0 \n"
  4238. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4239. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  4240. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4241. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4242. "vmovdqu %%ymm0,(%1) \n"
  4243. "lea 0x20(%1),%1 \n"
  4244. "sub $0x20,%2 \n"
  4245. "jg 1b \n"
  4246. "vzeroupper \n"
  4247. : "+r"(src_uyvy), // %0
  4248. "+r"(dst_y), // %1
  4249. "+r"(width) // %2
  4250. :
  4251. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4252. }
  4253. void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
  4254. int stride_uyvy,
  4255. uint8_t* dst_u,
  4256. uint8_t* dst_v,
  4257. int width) {
  4258. asm volatile(
  4259. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4260. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  4261. "sub %1,%2 \n"
  4262. LABELALIGN
  4263. "1: \n"
  4264. "vmovdqu (%0),%%ymm0 \n"
  4265. "vmovdqu 0x20(%0),%%ymm1 \n"
  4266. "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
  4267. "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
  4268. "lea 0x40(%0),%0 \n"
  4269. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  4270. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  4271. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4272. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4273. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  4274. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4275. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  4276. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  4277. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  4278. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4279. "vextractf128 $0x0,%%ymm1,(%1) \n"
  4280. "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
  4281. "lea 0x10(%1),%1 \n"
  4282. "sub $0x20,%3 \n"
  4283. "jg 1b \n"
  4284. "vzeroupper \n"
  4285. : "+r"(src_uyvy), // %0
  4286. "+r"(dst_u), // %1
  4287. "+r"(dst_v), // %2
  4288. "+r"(width) // %3
  4289. : "r"((intptr_t)(stride_uyvy)) // %4
  4290. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4291. }
  4292. void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
  4293. uint8_t* dst_u,
  4294. uint8_t* dst_v,
  4295. int width) {
  4296. asm volatile(
  4297. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4298. "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
  4299. "sub %1,%2 \n"
  4300. LABELALIGN
  4301. "1: \n"
  4302. "vmovdqu (%0),%%ymm0 \n"
  4303. "vmovdqu 0x20(%0),%%ymm1 \n"
  4304. "lea 0x40(%0),%0 \n"
  4305. "vpand %%ymm5,%%ymm0,%%ymm0 \n"
  4306. "vpand %%ymm5,%%ymm1,%%ymm1 \n"
  4307. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4308. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4309. "vpand %%ymm5,%%ymm0,%%ymm1 \n"
  4310. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4311. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  4312. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  4313. "vpermq $0xd8,%%ymm1,%%ymm1 \n"
  4314. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  4315. "vextractf128 $0x0,%%ymm1,(%1) \n"
  4316. "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
  4317. "lea 0x10(%1),%1 \n"
  4318. "sub $0x20,%3 \n"
  4319. "jg 1b \n"
  4320. "vzeroupper \n"
  4321. : "+r"(src_uyvy), // %0
  4322. "+r"(dst_u), // %1
  4323. "+r"(dst_v), // %2
  4324. "+r"(width) // %3
  4325. :
  4326. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  4327. }
  4328. #endif // HAS_YUY2TOYROW_AVX2
  4329. #ifdef HAS_ARGBBLENDROW_SSSE3
  4330. // Shuffle table for isolating alpha.
  4331. static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  4332. 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
4333. // Blend 4 pixels at a time; remaining pixels are handled 1 at a time

  4334. void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
  4335. const uint8_t* src_argb1,
  4336. uint8_t* dst_argb,
  4337. int width) {
  4338. asm volatile(
  4339. "pcmpeqb %%xmm7,%%xmm7 \n"
  4340. "psrlw $0xf,%%xmm7 \n"
  4341. "pcmpeqb %%xmm6,%%xmm6 \n"
  4342. "psrlw $0x8,%%xmm6 \n"
  4343. "pcmpeqb %%xmm5,%%xmm5 \n"
  4344. "psllw $0x8,%%xmm5 \n"
  4345. "pcmpeqb %%xmm4,%%xmm4 \n"
  4346. "pslld $0x18,%%xmm4 \n"
  4347. "sub $0x4,%3 \n"
  4348. "jl 49f \n"
  4349. // 4 pixel loop.
  4350. LABELALIGN
  4351. "40: \n"
  4352. "movdqu (%0),%%xmm3 \n"
  4353. "lea 0x10(%0),%0 \n"
  4354. "movdqa %%xmm3,%%xmm0 \n"
  4355. "pxor %%xmm4,%%xmm3 \n"
  4356. "movdqu (%1),%%xmm2 \n"
  4357. "pshufb %4,%%xmm3 \n"
  4358. "pand %%xmm6,%%xmm2 \n"
  4359. "paddw %%xmm7,%%xmm3 \n"
  4360. "pmullw %%xmm3,%%xmm2 \n"
  4361. "movdqu (%1),%%xmm1 \n"
  4362. "lea 0x10(%1),%1 \n"
  4363. "psrlw $0x8,%%xmm1 \n"
  4364. "por %%xmm4,%%xmm0 \n"
  4365. "pmullw %%xmm3,%%xmm1 \n"
  4366. "psrlw $0x8,%%xmm2 \n"
  4367. "paddusb %%xmm2,%%xmm0 \n"
  4368. "pand %%xmm5,%%xmm1 \n"
  4369. "paddusb %%xmm1,%%xmm0 \n"
  4370. "movdqu %%xmm0,(%2) \n"
  4371. "lea 0x10(%2),%2 \n"
  4372. "sub $0x4,%3 \n"
  4373. "jge 40b \n"
  4374. "49: \n"
  4375. "add $0x3,%3 \n"
  4376. "jl 99f \n"
  4377. // 1 pixel loop.
  4378. "91: \n"
  4379. "movd (%0),%%xmm3 \n"
  4380. "lea 0x4(%0),%0 \n"
  4381. "movdqa %%xmm3,%%xmm0 \n"
  4382. "pxor %%xmm4,%%xmm3 \n"
  4383. "movd (%1),%%xmm2 \n"
  4384. "pshufb %4,%%xmm3 \n"
  4385. "pand %%xmm6,%%xmm2 \n"
  4386. "paddw %%xmm7,%%xmm3 \n"
  4387. "pmullw %%xmm3,%%xmm2 \n"
  4388. "movd (%1),%%xmm1 \n"
  4389. "lea 0x4(%1),%1 \n"
  4390. "psrlw $0x8,%%xmm1 \n"
  4391. "por %%xmm4,%%xmm0 \n"
  4392. "pmullw %%xmm3,%%xmm1 \n"
  4393. "psrlw $0x8,%%xmm2 \n"
  4394. "paddusb %%xmm2,%%xmm0 \n"
  4395. "pand %%xmm5,%%xmm1 \n"
  4396. "paddusb %%xmm1,%%xmm0 \n"
  4397. "movd %%xmm0,(%2) \n"
  4398. "lea 0x4(%2),%2 \n"
  4399. "sub $0x1,%3 \n"
  4400. "jge 91b \n"
  4401. "99: \n"
  4402. : "+r"(src_argb0), // %0
  4403. "+r"(src_argb1), // %1
  4404. "+r"(dst_argb), // %2
  4405. "+r"(width) // %3
  4406. : "m"(kShuffleAlpha) // %4
  4407. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4408. "xmm7");
  4409. }
  4410. #endif // HAS_ARGBBLENDROW_SSSE3
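// Illustrative scalar sketch (not part of libyuv; the name is hypothetical).
// Per channel, the SSSE3 kernel above computes the "over" operator for a
// premultiplied foreground: dst = saturate(fg + ((256 - fg_alpha) * bg >> 8)),
// with the output alpha forced to 255. fg is src_argb0, bg is src_argb1.
static inline void ARGBBlendRow_ScalarSketch(const uint8_t* src_argb0,
                                             const uint8_t* src_argb1,
                                             uint8_t* dst_argb,
                                             int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    const uint32_t a = src_argb0[3];  // foreground alpha
    for (c = 0; c < 3; ++c) {         // B, G, R
      uint32_t v = src_argb0[c] + (((256 - a) * src_argb1[c]) >> 8);
      dst_argb[c] = (uint8_t)(v > 255 ? 255 : v);  // paddusb saturation
    }
    dst_argb[3] = 255;  // output alpha is forced opaque
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}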
  4411. #ifdef HAS_BLENDPLANEROW_SSSE3
  4412. // Blend 8 pixels at a time.
  4413. // unsigned version of math
  4414. // =((A2*C2)+(B2*(255-C2))+255)/256
  4415. // signed version of math
  4416. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  4417. void BlendPlaneRow_SSSE3(const uint8_t* src0,
  4418. const uint8_t* src1,
  4419. const uint8_t* alpha,
  4420. uint8_t* dst,
  4421. int width) {
  4422. asm volatile(
  4423. "pcmpeqb %%xmm5,%%xmm5 \n"
  4424. "psllw $0x8,%%xmm5 \n"
  4425. "mov $0x80808080,%%eax \n"
  4426. "movd %%eax,%%xmm6 \n"
  4427. "pshufd $0x0,%%xmm6,%%xmm6 \n"
  4428. "mov $0x807f807f,%%eax \n"
  4429. "movd %%eax,%%xmm7 \n"
  4430. "pshufd $0x0,%%xmm7,%%xmm7 \n"
  4431. "sub %2,%0 \n"
  4432. "sub %2,%1 \n"
  4433. "sub %2,%3 \n"
  4434. // 8 pixel loop.
  4435. LABELALIGN
  4436. "1: \n"
  4437. "movq (%2),%%xmm0 \n"
  4438. "punpcklbw %%xmm0,%%xmm0 \n"
  4439. "pxor %%xmm5,%%xmm0 \n"
  4440. "movq (%0,%2,1),%%xmm1 \n"
  4441. "movq (%1,%2,1),%%xmm2 \n"
  4442. "punpcklbw %%xmm2,%%xmm1 \n"
  4443. "psubb %%xmm6,%%xmm1 \n"
  4444. "pmaddubsw %%xmm1,%%xmm0 \n"
  4445. "paddw %%xmm7,%%xmm0 \n"
  4446. "psrlw $0x8,%%xmm0 \n"
  4447. "packuswb %%xmm0,%%xmm0 \n"
  4448. "movq %%xmm0,(%3,%2,1) \n"
  4449. "lea 0x8(%2),%2 \n"
  4450. "sub $0x8,%4 \n"
  4451. "jg 1b \n"
  4452. : "+r"(src0), // %0
  4453. "+r"(src1), // %1
  4454. "+r"(alpha), // %2
  4455. "+r"(dst), // %3
  4456. "+rm"(width) // %4
  4457. ::"memory",
  4458. "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
  4459. }
  4460. #endif // HAS_BLENDPLANEROW_SSSE3
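// Illustrative scalar sketch (not part of libyuv; the name is hypothetical).
// This is the unsigned formula from the comment above,
// dst = (src0*alpha + src1*(255-alpha) + 255) / 256; the SSSE3/AVX2 kernels
// compute the equivalent signed form so they can use pmaddubsw.
static inline void BlendPlaneRow_ScalarSketch(const uint8_t* src0,
                                              const uint8_t* src1,
                                              const uint8_t* alpha,
                                              uint8_t* dst,
                                              int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t a = alpha[x];
    dst[x] = (uint8_t)((src0[x] * a + src1[x] * (255 - a) + 255) >> 8);
  }
}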
  4461. #ifdef HAS_BLENDPLANEROW_AVX2
  4462. // Blend 32 pixels at a time.
  4463. // unsigned version of math
  4464. // =((A2*C2)+(B2*(255-C2))+255)/256
  4465. // signed version of math
  4466. // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
  4467. void BlendPlaneRow_AVX2(const uint8_t* src0,
  4468. const uint8_t* src1,
  4469. const uint8_t* alpha,
  4470. uint8_t* dst,
  4471. int width) {
  4472. asm volatile(
  4473. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4474. "vpsllw $0x8,%%ymm5,%%ymm5 \n"
  4475. "mov $0x80808080,%%eax \n"
  4476. "vmovd %%eax,%%xmm6 \n"
  4477. "vbroadcastss %%xmm6,%%ymm6 \n"
  4478. "mov $0x807f807f,%%eax \n"
  4479. "vmovd %%eax,%%xmm7 \n"
  4480. "vbroadcastss %%xmm7,%%ymm7 \n"
  4481. "sub %2,%0 \n"
  4482. "sub %2,%1 \n"
  4483. "sub %2,%3 \n"
  4484. // 32 pixel loop.
  4485. LABELALIGN
  4486. "1: \n"
  4487. "vmovdqu (%2),%%ymm0 \n"
  4488. "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
  4489. "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
  4490. "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
  4491. "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
  4492. "vmovdqu (%0,%2,1),%%ymm1 \n"
  4493. "vmovdqu (%1,%2,1),%%ymm2 \n"
  4494. "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
  4495. "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
  4496. "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
  4497. "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
  4498. "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
  4499. "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
  4500. "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
  4501. "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
  4502. "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
  4503. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4504. "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
  4505. "vmovdqu %%ymm0,(%3,%2,1) \n"
  4506. "lea 0x20(%2),%2 \n"
  4507. "sub $0x20,%4 \n"
  4508. "jg 1b \n"
  4509. "vzeroupper \n"
  4510. : "+r"(src0), // %0
  4511. "+r"(src1), // %1
  4512. "+r"(alpha), // %2
  4513. "+r"(dst), // %3
  4514. "+rm"(width) // %4
  4515. ::"memory",
  4516. "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4517. "xmm7");
  4518. }
  4519. #endif // HAS_BLENDPLANEROW_AVX2
  4520. #ifdef HAS_ARGBATTENUATEROW_SSSE3
  4521. // Shuffle table duplicating alpha.
  4522. static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
  4523. 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
  4524. static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  4525. 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
  4526. // Attenuate 4 pixels at a time.
  4527. void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
  4528. uint8_t* dst_argb,
  4529. int width) {
  4530. asm volatile(
  4531. "pcmpeqb %%xmm3,%%xmm3 \n"
  4532. "pslld $0x18,%%xmm3 \n"
  4533. "movdqa %3,%%xmm4 \n"
  4534. "movdqa %4,%%xmm5 \n"
  4535. // 4 pixel loop.
  4536. LABELALIGN
  4537. "1: \n"
  4538. "movdqu (%0),%%xmm0 \n"
  4539. "pshufb %%xmm4,%%xmm0 \n"
  4540. "movdqu (%0),%%xmm1 \n"
  4541. "punpcklbw %%xmm1,%%xmm1 \n"
  4542. "pmulhuw %%xmm1,%%xmm0 \n"
  4543. "movdqu (%0),%%xmm1 \n"
  4544. "pshufb %%xmm5,%%xmm1 \n"
  4545. "movdqu (%0),%%xmm2 \n"
  4546. "punpckhbw %%xmm2,%%xmm2 \n"
  4547. "pmulhuw %%xmm2,%%xmm1 \n"
  4548. "movdqu (%0),%%xmm2 \n"
  4549. "lea 0x10(%0),%0 \n"
  4550. "pand %%xmm3,%%xmm2 \n"
  4551. "psrlw $0x8,%%xmm0 \n"
  4552. "psrlw $0x8,%%xmm1 \n"
  4553. "packuswb %%xmm1,%%xmm0 \n"
  4554. "por %%xmm2,%%xmm0 \n"
  4555. "movdqu %%xmm0,(%1) \n"
  4556. "lea 0x10(%1),%1 \n"
  4557. "sub $0x4,%2 \n"
  4558. "jg 1b \n"
  4559. : "+r"(src_argb), // %0
  4560. "+r"(dst_argb), // %1
  4561. "+r"(width) // %2
  4562. : "m"(kShuffleAlpha0), // %3
  4563. "m"(kShuffleAlpha1) // %4
  4564. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  4565. }
  4566. #endif // HAS_ARGBATTENUATEROW_SSSE3
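// Illustrative scalar sketch (not part of libyuv; the name is hypothetical).
// Attenuation premultiplies each color channel by the pixel's alpha, roughly
// dst = src * a / 255. The expression below mirrors the pshufb/punpcklbw +
// pmulhuw + psrlw sequence above, which multiplies 8.8 fixed-point copies of
// the value and the alpha.
static inline void ARGBAttenuateRow_ScalarSketch(const uint8_t* src_argb,
                                                 uint8_t* dst_argb, int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    const uint32_t a = src_argb[3];
    for (c = 0; c < 3; ++c) {  // B, G, R
      const uint32_t s = src_argb[c];
      dst_argb[c] = (uint8_t)((((s * 0x0101u) * (a * 0x0101u)) >> 16) >> 8);
    }
    dst_argb[3] = (uint8_t)a;  // alpha is carried through unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}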
  4567. #ifdef HAS_ARGBATTENUATEROW_AVX2
  4568. // Shuffle table duplicating alpha.
  4569. static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
  4570. 128u, 128u, 14u, 15u, 14u, 15u,
  4571. 14u, 15u, 128u, 128u};
  4572. // Attenuate 8 pixels at a time.
  4573. void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
  4574. uint8_t* dst_argb,
  4575. int width) {
  4576. asm volatile(
  4577. "vbroadcastf128 %3,%%ymm4 \n"
  4578. "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
  4579. "vpslld $0x18,%%ymm5,%%ymm5 \n"
  4580. "sub %0,%1 \n"
  4581. // 8 pixel loop.
  4582. LABELALIGN
  4583. "1: \n"
  4584. "vmovdqu (%0),%%ymm6 \n"
  4585. "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
  4586. "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
  4587. "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
  4588. "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
  4589. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  4590. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  4591. "vpand %%ymm5,%%ymm6,%%ymm6 \n"
  4592. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  4593. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  4594. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4595. "vpor %%ymm6,%%ymm0,%%ymm0 \n"
  4596. "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
  4597. "lea 0x20(%0),%0 \n"
  4598. "sub $0x8,%2 \n"
  4599. "jg 1b \n"
  4600. "vzeroupper \n"
  4601. : "+r"(src_argb), // %0
  4602. "+r"(dst_argb), // %1
  4603. "+r"(width) // %2
  4604. : "m"(kShuffleAlpha_AVX2) // %3
  4605. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  4606. }
  4607. #endif // HAS_ARGBATTENUATEROW_AVX2
  4608. #ifdef HAS_ARGBUNATTENUATEROW_SSE2
  4609. // Unattenuate 4 pixels at a time.
  4610. void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
  4611. uint8_t* dst_argb,
  4612. int width) {
  4613. uintptr_t alpha;
  4614. asm volatile(
  4615. // 4 pixel loop.
  4616. LABELALIGN
  4617. "1: \n"
  4618. "movdqu (%0),%%xmm0 \n"
  4619. "movzb 0x03(%0),%3 \n"
  4620. "punpcklbw %%xmm0,%%xmm0 \n"
  4621. "movd 0x00(%4,%3,4),%%xmm2 \n"
  4622. "movzb 0x07(%0),%3 \n"
  4623. "movd 0x00(%4,%3,4),%%xmm3 \n"
  4624. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  4625. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  4626. "movlhps %%xmm3,%%xmm2 \n"
  4627. "pmulhuw %%xmm2,%%xmm0 \n"
  4628. "movdqu (%0),%%xmm1 \n"
  4629. "movzb 0x0b(%0),%3 \n"
  4630. "punpckhbw %%xmm1,%%xmm1 \n"
  4631. "movd 0x00(%4,%3,4),%%xmm2 \n"
  4632. "movzb 0x0f(%0),%3 \n"
  4633. "movd 0x00(%4,%3,4),%%xmm3 \n"
  4634. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  4635. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  4636. "movlhps %%xmm3,%%xmm2 \n"
  4637. "pmulhuw %%xmm2,%%xmm1 \n"
  4638. "lea 0x10(%0),%0 \n"
  4639. "packuswb %%xmm1,%%xmm0 \n"
  4640. "movdqu %%xmm0,(%1) \n"
  4641. "lea 0x10(%1),%1 \n"
  4642. "sub $0x4,%2 \n"
  4643. "jg 1b \n"
  4644. : "+r"(src_argb), // %0
  4645. "+r"(dst_argb), // %1
  4646. "+r"(width), // %2
  4647. "=&r"(alpha) // %3
  4648. : "r"(fixed_invtbl8) // %4
  4649. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  4650. }
  4651. #endif // HAS_ARGBUNATTENUATEROW_SSE2
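// Illustrative scalar sketch (not part of libyuv; the name is hypothetical).
// Unattenuation divides each color channel by alpha and rescales by 255,
// clamping the result. The SSE2/AVX2 kernels above avoid the divide by
// looking up a fixed-point reciprocal of alpha in fixed_invtbl8, so their
// rounding can differ slightly from this; the a == 0 pass-through below is a
// choice of this sketch, not a statement about the table's convention.
static inline void ARGBUnattenuateRow_ScalarSketch(const uint8_t* src_argb,
                                                   uint8_t* dst_argb,
                                                   int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    const uint32_t a = src_argb[3];
    for (c = 0; c < 3; ++c) {  // B, G, R
      uint32_t v = a ? (src_argb[c] * 255u) / a : src_argb[c];
      dst_argb[c] = (uint8_t)(v > 255 ? 255 : v);
    }
    dst_argb[3] = (uint8_t)a;  // alpha passes through unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}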
  4652. #ifdef HAS_ARGBUNATTENUATEROW_AVX2
  4653. // Shuffle table duplicating alpha.
  4654. static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  4655. 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
  4656. // Unattenuate 8 pixels at a time.
  4657. void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
  4658. uint8_t* dst_argb,
  4659. int width) {
  4660. uintptr_t alpha;
  4661. asm volatile(
  4662. "sub %0,%1 \n"
  4663. "vbroadcastf128 %5,%%ymm5 \n"
  4664. // 8 pixel loop.
  4665. LABELALIGN
  4666. "1: \n"
4667. // replace VPGATHER: load the 8 alpha reciprocals one element at a time
  4668. "movzb 0x03(%0),%3 \n"
  4669. "vmovd 0x00(%4,%3,4),%%xmm0 \n"
  4670. "movzb 0x07(%0),%3 \n"
  4671. "vmovd 0x00(%4,%3,4),%%xmm1 \n"
  4672. "movzb 0x0b(%0),%3 \n"
  4673. "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
  4674. "vmovd 0x00(%4,%3,4),%%xmm2 \n"
  4675. "movzb 0x0f(%0),%3 \n"
  4676. "vmovd 0x00(%4,%3,4),%%xmm3 \n"
  4677. "movzb 0x13(%0),%3 \n"
  4678. "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
  4679. "vmovd 0x00(%4,%3,4),%%xmm0 \n"
  4680. "movzb 0x17(%0),%3 \n"
  4681. "vmovd 0x00(%4,%3,4),%%xmm1 \n"
  4682. "movzb 0x1b(%0),%3 \n"
  4683. "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
  4684. "vmovd 0x00(%4,%3,4),%%xmm2 \n"
  4685. "movzb 0x1f(%0),%3 \n"
  4686. "vmovd 0x00(%4,%3,4),%%xmm3 \n"
  4687. "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
  4688. "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
  4689. "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
  4690. "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
  4691. // end of VPGATHER
  4692. "vmovdqu (%0),%%ymm6 \n"
  4693. "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
  4694. "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
  4695. "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
  4696. "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
  4697. "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
  4698. "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
  4699. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  4700. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  4701. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  4702. "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
  4703. "lea 0x20(%0),%0 \n"
  4704. "sub $0x8,%2 \n"
  4705. "jg 1b \n"
  4706. "vzeroupper \n"
  4707. : "+r"(src_argb), // %0
  4708. "+r"(dst_argb), // %1
  4709. "+r"(width), // %2
  4710. "=&r"(alpha) // %3
  4711. : "r"(fixed_invtbl8), // %4
  4712. "m"(kUnattenShuffleAlpha_AVX2) // %5
  4713. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4714. "xmm7");
  4715. }
  4716. #endif // HAS_ARGBUNATTENUATEROW_AVX2
  4717. #ifdef HAS_ARGBGRAYROW_SSSE3
4718. // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
  4719. void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  4720. asm volatile(
  4721. "movdqa %3,%%xmm4 \n"
  4722. "movdqa %4,%%xmm5 \n"
  4723. // 8 pixel loop.
  4724. LABELALIGN
  4725. "1: \n"
  4726. "movdqu (%0),%%xmm0 \n"
  4727. "movdqu 0x10(%0),%%xmm1 \n"
  4728. "psubb %%xmm5,%%xmm0 \n"
  4729. "psubb %%xmm5,%%xmm1 \n"
  4730. "movdqu %%xmm4,%%xmm6 \n"
  4731. "pmaddubsw %%xmm0,%%xmm6 \n"
  4732. "movdqu %%xmm4,%%xmm0 \n"
  4733. "pmaddubsw %%xmm1,%%xmm0 \n"
  4734. "phaddw %%xmm0,%%xmm6 \n"
  4735. "paddw %%xmm5,%%xmm6 \n"
  4736. "psrlw $0x8,%%xmm6 \n"
  4737. "packuswb %%xmm6,%%xmm6 \n"
  4738. "movdqu (%0),%%xmm2 \n"
  4739. "movdqu 0x10(%0),%%xmm3 \n"
  4740. "lea 0x20(%0),%0 \n"
  4741. "psrld $0x18,%%xmm2 \n"
  4742. "psrld $0x18,%%xmm3 \n"
  4743. "packuswb %%xmm3,%%xmm2 \n"
  4744. "packuswb %%xmm2,%%xmm2 \n"
  4745. "movdqa %%xmm6,%%xmm3 \n"
  4746. "punpcklbw %%xmm6,%%xmm6 \n"
  4747. "punpcklbw %%xmm2,%%xmm3 \n"
  4748. "movdqa %%xmm6,%%xmm1 \n"
  4749. "punpcklwd %%xmm3,%%xmm6 \n"
  4750. "punpckhwd %%xmm3,%%xmm1 \n"
  4751. "movdqu %%xmm6,(%1) \n"
  4752. "movdqu %%xmm1,0x10(%1) \n"
  4753. "lea 0x20(%1),%1 \n"
  4754. "sub $0x8,%2 \n"
  4755. "jg 1b \n"
  4756. : "+r"(src_argb), // %0
  4757. "+r"(dst_argb), // %1
  4758. "+r"(width) // %2
  4759. : "m"(kARGBToYJ), // %3
  4760. "m"(kSub128) // %4
  4761. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  4762. }
  4763. #endif // HAS_ARGBGRAYROW_SSSE3
  4764. #ifdef HAS_ARGBSEPIAROW_SSSE3
  4765. // b = (r * 35 + g * 68 + b * 17) >> 7
  4766. // g = (r * 45 + g * 88 + b * 22) >> 7
  4767. // r = (r * 50 + g * 98 + b * 24) >> 7
  4768. // Constant for ARGB color to sepia tone
  4769. static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
  4770. 17, 68, 35, 0, 17, 68, 35, 0};
  4771. static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
  4772. 22, 88, 45, 0, 22, 88, 45, 0};
  4773. static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
  4774. 24, 98, 50, 0, 24, 98, 50, 0};
  4775. // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
  4776. void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  4777. asm volatile(
  4778. "movdqa %2,%%xmm2 \n"
  4779. "movdqa %3,%%xmm3 \n"
  4780. "movdqa %4,%%xmm4 \n"
  4781. // 8 pixel loop.
  4782. LABELALIGN
  4783. "1: \n"
  4784. "movdqu (%0),%%xmm0 \n"
  4785. "movdqu 0x10(%0),%%xmm6 \n"
  4786. "pmaddubsw %%xmm2,%%xmm0 \n"
  4787. "pmaddubsw %%xmm2,%%xmm6 \n"
  4788. "phaddw %%xmm6,%%xmm0 \n"
  4789. "psrlw $0x7,%%xmm0 \n"
  4790. "packuswb %%xmm0,%%xmm0 \n"
  4791. "movdqu (%0),%%xmm5 \n"
  4792. "movdqu 0x10(%0),%%xmm1 \n"
  4793. "pmaddubsw %%xmm3,%%xmm5 \n"
  4794. "pmaddubsw %%xmm3,%%xmm1 \n"
  4795. "phaddw %%xmm1,%%xmm5 \n"
  4796. "psrlw $0x7,%%xmm5 \n"
  4797. "packuswb %%xmm5,%%xmm5 \n"
  4798. "punpcklbw %%xmm5,%%xmm0 \n"
  4799. "movdqu (%0),%%xmm5 \n"
  4800. "movdqu 0x10(%0),%%xmm1 \n"
  4801. "pmaddubsw %%xmm4,%%xmm5 \n"
  4802. "pmaddubsw %%xmm4,%%xmm1 \n"
  4803. "phaddw %%xmm1,%%xmm5 \n"
  4804. "psrlw $0x7,%%xmm5 \n"
  4805. "packuswb %%xmm5,%%xmm5 \n"
  4806. "movdqu (%0),%%xmm6 \n"
  4807. "movdqu 0x10(%0),%%xmm1 \n"
  4808. "psrld $0x18,%%xmm6 \n"
  4809. "psrld $0x18,%%xmm1 \n"
  4810. "packuswb %%xmm1,%%xmm6 \n"
  4811. "packuswb %%xmm6,%%xmm6 \n"
  4812. "punpcklbw %%xmm6,%%xmm5 \n"
  4813. "movdqa %%xmm0,%%xmm1 \n"
  4814. "punpcklwd %%xmm5,%%xmm0 \n"
  4815. "punpckhwd %%xmm5,%%xmm1 \n"
  4816. "movdqu %%xmm0,(%0) \n"
  4817. "movdqu %%xmm1,0x10(%0) \n"
  4818. "lea 0x20(%0),%0 \n"
  4819. "sub $0x8,%1 \n"
  4820. "jg 1b \n"
  4821. : "+r"(dst_argb), // %0
  4822. "+r"(width) // %1
  4823. : "m"(kARGBToSepiaB), // %2
  4824. "m"(kARGBToSepiaG), // %3
  4825. "m"(kARGBToSepiaR) // %4
  4826. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  4827. }
  4828. #endif // HAS_ARGBSEPIAROW_SSSE3
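// Illustrative scalar sketch (not part of libyuv; the name is hypothetical).
// It applies the sepia weights from the comment above per pixel, in place.
// Bytes are laid out B,G,R,A; alpha is untouched, and the green/red results
// can exceed 255 and are clamped, matching the packuswb saturation above.
static inline void ARGBSepiaRow_ScalarSketch(uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int b = dst_argb[0];
    const int g = dst_argb[1];
    const int r = dst_argb[2];
    const int sb = (r * 35 + g * 68 + b * 17) >> 7;
    const int sg = (r * 45 + g * 88 + b * 22) >> 7;
    const int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8_t)(sr > 255 ? 255 : sr);
    dst_argb += 4;
  }
}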
  4829. #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4830. // Transform 8 ARGB pixels (32 bytes) with color matrix.
  4831. // Same as Sepia except matrix is provided.
  4832. void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
  4833. uint8_t* dst_argb,
  4834. const int8_t* matrix_argb,
  4835. int width) {
  4836. asm volatile(
  4837. "movdqu (%3),%%xmm5 \n"
  4838. "pshufd $0x00,%%xmm5,%%xmm2 \n"
  4839. "pshufd $0x55,%%xmm5,%%xmm3 \n"
  4840. "pshufd $0xaa,%%xmm5,%%xmm4 \n"
  4841. "pshufd $0xff,%%xmm5,%%xmm5 \n"
  4842. // 8 pixel loop.
  4843. LABELALIGN
  4844. "1: \n"
  4845. "movdqu (%0),%%xmm0 \n"
  4846. "movdqu 0x10(%0),%%xmm7 \n"
  4847. "pmaddubsw %%xmm2,%%xmm0 \n"
  4848. "pmaddubsw %%xmm2,%%xmm7 \n"
  4849. "movdqu (%0),%%xmm6 \n"
  4850. "movdqu 0x10(%0),%%xmm1 \n"
  4851. "pmaddubsw %%xmm3,%%xmm6 \n"
  4852. "pmaddubsw %%xmm3,%%xmm1 \n"
  4853. "phaddsw %%xmm7,%%xmm0 \n"
  4854. "phaddsw %%xmm1,%%xmm6 \n"
  4855. "psraw $0x6,%%xmm0 \n"
  4856. "psraw $0x6,%%xmm6 \n"
  4857. "packuswb %%xmm0,%%xmm0 \n"
  4858. "packuswb %%xmm6,%%xmm6 \n"
  4859. "punpcklbw %%xmm6,%%xmm0 \n"
  4860. "movdqu (%0),%%xmm1 \n"
  4861. "movdqu 0x10(%0),%%xmm7 \n"
  4862. "pmaddubsw %%xmm4,%%xmm1 \n"
  4863. "pmaddubsw %%xmm4,%%xmm7 \n"
  4864. "phaddsw %%xmm7,%%xmm1 \n"
  4865. "movdqu (%0),%%xmm6 \n"
  4866. "movdqu 0x10(%0),%%xmm7 \n"
  4867. "pmaddubsw %%xmm5,%%xmm6 \n"
  4868. "pmaddubsw %%xmm5,%%xmm7 \n"
  4869. "phaddsw %%xmm7,%%xmm6 \n"
  4870. "psraw $0x6,%%xmm1 \n"
  4871. "psraw $0x6,%%xmm6 \n"
  4872. "packuswb %%xmm1,%%xmm1 \n"
  4873. "packuswb %%xmm6,%%xmm6 \n"
  4874. "punpcklbw %%xmm6,%%xmm1 \n"
  4875. "movdqa %%xmm0,%%xmm6 \n"
  4876. "punpcklwd %%xmm1,%%xmm0 \n"
  4877. "punpckhwd %%xmm1,%%xmm6 \n"
  4878. "movdqu %%xmm0,(%1) \n"
  4879. "movdqu %%xmm6,0x10(%1) \n"
  4880. "lea 0x20(%0),%0 \n"
  4881. "lea 0x20(%1),%1 \n"
  4882. "sub $0x8,%2 \n"
  4883. "jg 1b \n"
  4884. : "+r"(src_argb), // %0
  4885. "+r"(dst_argb), // %1
  4886. "+r"(width) // %2
  4887. : "r"(matrix_argb) // %3
  4888. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4889. "xmm7");
  4890. }
  4891. #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
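// Illustrative scalar sketch (not part of libyuv; the name is hypothetical).
// Each output channel is a signed dot product of the input B,G,R,A bytes with
// one 4-entry row of matrix_argb, shifted right by 6 (psraw $0x6 above) and
// clamped. Intermediate 16-bit saturation from pmaddubsw/phaddsw is ignored
// here, so extreme matrices may differ slightly.
static inline void ARGBColorMatrixRow_ScalarSketch(const uint8_t* src_argb,
                                                   uint8_t* dst_argb,
                                                   const int8_t* matrix_argb,
                                                   int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {  // B, G, R, A outputs
      const int8_t* m = matrix_argb + c * 4;
      const int v = (src_argb[0] * m[0] + src_argb[1] * m[1] +
                     src_argb[2] * m[2] + src_argb[3] * m[3]) >> 6;
      dst_argb[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}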
  4892. #ifdef HAS_ARGBQUANTIZEROW_SSE2
  4893. // Quantize 4 ARGB pixels (16 bytes).
  4894. void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
  4895. int scale,
  4896. int interval_size,
  4897. int interval_offset,
  4898. int width) {
  4899. asm volatile(
  4900. "movd %2,%%xmm2 \n"
  4901. "movd %3,%%xmm3 \n"
  4902. "movd %4,%%xmm4 \n"
  4903. "pshuflw $0x40,%%xmm2,%%xmm2 \n"
  4904. "pshufd $0x44,%%xmm2,%%xmm2 \n"
  4905. "pshuflw $0x40,%%xmm3,%%xmm3 \n"
  4906. "pshufd $0x44,%%xmm3,%%xmm3 \n"
  4907. "pshuflw $0x40,%%xmm4,%%xmm4 \n"
  4908. "pshufd $0x44,%%xmm4,%%xmm4 \n"
  4909. "pxor %%xmm5,%%xmm5 \n"
  4910. "pcmpeqb %%xmm6,%%xmm6 \n"
  4911. "pslld $0x18,%%xmm6 \n"
  4912. // 4 pixel loop.
  4913. LABELALIGN
  4914. "1: \n"
  4915. "movdqu (%0),%%xmm0 \n"
  4916. "punpcklbw %%xmm5,%%xmm0 \n"
  4917. "pmulhuw %%xmm2,%%xmm0 \n"
  4918. "movdqu (%0),%%xmm1 \n"
  4919. "punpckhbw %%xmm5,%%xmm1 \n"
  4920. "pmulhuw %%xmm2,%%xmm1 \n"
  4921. "pmullw %%xmm3,%%xmm0 \n"
  4922. "movdqu (%0),%%xmm7 \n"
  4923. "pmullw %%xmm3,%%xmm1 \n"
  4924. "pand %%xmm6,%%xmm7 \n"
  4925. "paddw %%xmm4,%%xmm0 \n"
  4926. "paddw %%xmm4,%%xmm1 \n"
  4927. "packuswb %%xmm1,%%xmm0 \n"
  4928. "por %%xmm7,%%xmm0 \n"
  4929. "movdqu %%xmm0,(%0) \n"
  4930. "lea 0x10(%0),%0 \n"
  4931. "sub $0x4,%1 \n"
  4932. "jg 1b \n"
  4933. : "+r"(dst_argb), // %0
  4934. "+r"(width) // %1
  4935. : "r"(scale), // %2
  4936. "r"(interval_size), // %3
  4937. "r"(interval_offset) // %4
  4938. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  4939. "xmm7");
  4940. }
  4941. #endif // HAS_ARGBQUANTIZEROW_SSE2
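// Illustrative scalar sketch (not part of libyuv; the name is hypothetical).
// Quantization maps each color channel into a bucket: scale is a 16.16-style
// reciprocal of interval_size (pmulhuw takes the high 16 bits of the product),
// then the bucket index is rescaled and offset. Alpha is left unchanged, as
// the mask/por in the SSE2 kernel above does.
static inline void ARGBQuantizeRow_ScalarSketch(uint8_t* dst_argb,
                                                int scale,
                                                int interval_size,
                                                int interval_offset,
                                                int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 3; ++c) {  // quantize B, G, R; leave A alone
      const int v =
          ((dst_argb[c] * scale) >> 16) * interval_size + interval_offset;
      dst_argb[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    dst_argb += 4;
  }
}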
  4942. #ifdef HAS_ARGBSHADEROW_SSE2
  4943. // Shade 4 pixels at a time by specified value.
  4944. void ARGBShadeRow_SSE2(const uint8_t* src_argb,
  4945. uint8_t* dst_argb,
  4946. int width,
  4947. uint32_t value) {
  4948. asm volatile(
  4949. "movd %3,%%xmm2 \n"
  4950. "punpcklbw %%xmm2,%%xmm2 \n"
  4951. "punpcklqdq %%xmm2,%%xmm2 \n"
  4952. // 4 pixel loop.
  4953. LABELALIGN
  4954. "1: \n"
  4955. "movdqu (%0),%%xmm0 \n"
  4956. "lea 0x10(%0),%0 \n"
  4957. "movdqa %%xmm0,%%xmm1 \n"
  4958. "punpcklbw %%xmm0,%%xmm0 \n"
  4959. "punpckhbw %%xmm1,%%xmm1 \n"
  4960. "pmulhuw %%xmm2,%%xmm0 \n"
  4961. "pmulhuw %%xmm2,%%xmm1 \n"
  4962. "psrlw $0x8,%%xmm0 \n"
  4963. "psrlw $0x8,%%xmm1 \n"
  4964. "packuswb %%xmm1,%%xmm0 \n"
  4965. "movdqu %%xmm0,(%1) \n"
  4966. "lea 0x10(%1),%1 \n"
  4967. "sub $0x4,%2 \n"
  4968. "jg 1b \n"
  4969. : "+r"(src_argb), // %0
  4970. "+r"(dst_argb), // %1
  4971. "+r"(width) // %2
  4972. : "r"(value) // %3
  4973. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  4974. }
  4975. #endif // HAS_ARGBSHADEROW_SSE2
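// Illustrative scalar sketch (not part of libyuv; the name is hypothetical).
// Each channel of the pixel is scaled by the matching channel byte of 'value'
// in 8.8 fixed point, roughly dst = src * value_channel / 255. The expression
// below mirrors the punpcklbw + pmulhuw + psrlw sequence above.
static inline void ARGBShadeRow_ScalarSketch(const uint8_t* src_argb,
                                             uint8_t* dst_argb,
                                             int width,
                                             uint32_t value) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      const uint32_t v = (value >> (8 * c)) & 0xff;  // B,G,R,A byte of value
      const uint32_t s = src_argb[c];
      dst_argb[c] = (uint8_t)(((s * 0x0101u) * (v * 0x0101u)) >> 24);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}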
  4976. #ifdef HAS_ARGBMULTIPLYROW_SSE2
  4977. // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
  4978. void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
  4979. const uint8_t* src_argb1,
  4980. uint8_t* dst_argb,
  4981. int width) {
  4982. asm volatile(
  4983. "pxor %%xmm5,%%xmm5 \n"
  4984. // 4 pixel loop.
  4985. LABELALIGN
  4986. "1: \n"
  4987. "movdqu (%0),%%xmm0 \n"
  4988. "lea 0x10(%0),%0 \n"
  4989. "movdqu (%1),%%xmm2 \n"
  4990. "lea 0x10(%1),%1 \n"
  4991. "movdqu %%xmm0,%%xmm1 \n"
  4992. "movdqu %%xmm2,%%xmm3 \n"
  4993. "punpcklbw %%xmm0,%%xmm0 \n"
  4994. "punpckhbw %%xmm1,%%xmm1 \n"
  4995. "punpcklbw %%xmm5,%%xmm2 \n"
  4996. "punpckhbw %%xmm5,%%xmm3 \n"
  4997. "pmulhuw %%xmm2,%%xmm0 \n"
  4998. "pmulhuw %%xmm3,%%xmm1 \n"
  4999. "packuswb %%xmm1,%%xmm0 \n"
  5000. "movdqu %%xmm0,(%2) \n"
  5001. "lea 0x10(%2),%2 \n"
  5002. "sub $0x4,%3 \n"
  5003. "jg 1b \n"
  5004. : "+r"(src_argb0), // %0
  5005. "+r"(src_argb1), // %1
  5006. "+r"(dst_argb), // %2
  5007. "+r"(width) // %3
  5008. :
  5009. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  5010. }
  5011. #endif // HAS_ARGBMULTIPLYROW_SSE2
  5012. #ifdef HAS_ARGBMULTIPLYROW_AVX2
  5013. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
  5014. void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
  5015. const uint8_t* src_argb1,
  5016. uint8_t* dst_argb,
  5017. int width) {
  5018. asm volatile(
  5019. "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
5020. // 8 pixel loop.
  5021. LABELALIGN
  5022. "1: \n"
  5023. "vmovdqu (%0),%%ymm1 \n"
  5024. "lea 0x20(%0),%0 \n"
  5025. "vmovdqu (%1),%%ymm3 \n"
  5026. "lea 0x20(%1),%1 \n"
  5027. "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
  5028. "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
  5029. "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
  5030. "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
  5031. "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
  5032. "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
  5033. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  5034. "vmovdqu %%ymm0,(%2) \n"
  5035. "lea 0x20(%2),%2 \n"
  5036. "sub $0x8,%3 \n"
  5037. "jg 1b \n"
  5038. "vzeroupper \n"
  5039. : "+r"(src_argb0), // %0
  5040. "+r"(src_argb1), // %1
  5041. "+r"(dst_argb), // %2
  5042. "+r"(width) // %3
  5043. :
  5044. : "memory", "cc"
  5045. #if defined(__AVX2__)
  5046. ,
  5047. "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  5048. #endif
  5049. );
  5050. }
  5051. #endif // HAS_ARGBMULTIPLYROW_AVX2
  5052. #ifdef HAS_ARGBADDROW_SSE2
  5053. // Add 2 rows of ARGB pixels together, 4 pixels at a time.
  5054. void ARGBAddRow_SSE2(const uint8_t* src_argb0,
  5055. const uint8_t* src_argb1,
  5056. uint8_t* dst_argb,
  5057. int width) {
  5058. asm volatile(
  5059. // 4 pixel loop.
  5060. LABELALIGN
  5061. "1: \n"
  5062. "movdqu (%0),%%xmm0 \n"
  5063. "lea 0x10(%0),%0 \n"
  5064. "movdqu (%1),%%xmm1 \n"
  5065. "lea 0x10(%1),%1 \n"
  5066. "paddusb %%xmm1,%%xmm0 \n"
  5067. "movdqu %%xmm0,(%2) \n"
  5068. "lea 0x10(%2),%2 \n"
  5069. "sub $0x4,%3 \n"
  5070. "jg 1b \n"
  5071. : "+r"(src_argb0), // %0
  5072. "+r"(src_argb1), // %1
  5073. "+r"(dst_argb), // %2
  5074. "+r"(width) // %3
  5075. :
  5076. : "memory", "cc", "xmm0", "xmm1");
  5077. }
  5078. #endif // HAS_ARGBADDROW_SSE2
  5079. #ifdef HAS_ARGBADDROW_AVX2
5080. // Add 2 rows of ARGB pixels together, 8 pixels at a time.
  5081. void ARGBAddRow_AVX2(const uint8_t* src_argb0,
  5082. const uint8_t* src_argb1,
  5083. uint8_t* dst_argb,
  5084. int width) {
  5085. asm volatile(
5086. // 8 pixel loop.
  5087. LABELALIGN
  5088. "1: \n"
  5089. "vmovdqu (%0),%%ymm0 \n"
  5090. "lea 0x20(%0),%0 \n"
  5091. "vpaddusb (%1),%%ymm0,%%ymm0 \n"
  5092. "lea 0x20(%1),%1 \n"
  5093. "vmovdqu %%ymm0,(%2) \n"
  5094. "lea 0x20(%2),%2 \n"
  5095. "sub $0x8,%3 \n"
  5096. "jg 1b \n"
  5097. "vzeroupper \n"
  5098. : "+r"(src_argb0), // %0
  5099. "+r"(src_argb1), // %1
  5100. "+r"(dst_argb), // %2
  5101. "+r"(width) // %3
  5102. :
  5103. : "memory", "cc", "xmm0");
  5104. }
  5105. #endif // HAS_ARGBADDROW_AVX2
  5106. #ifdef HAS_ARGBSUBTRACTROW_SSE2
  5107. // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
  5108. void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
  5109. const uint8_t* src_argb1,
  5110. uint8_t* dst_argb,
  5111. int width) {
  5112. asm volatile(
  5113. // 4 pixel loop.
  5114. LABELALIGN
  5115. "1: \n"
  5116. "movdqu (%0),%%xmm0 \n"
  5117. "lea 0x10(%0),%0 \n"
  5118. "movdqu (%1),%%xmm1 \n"
  5119. "lea 0x10(%1),%1 \n"
  5120. "psubusb %%xmm1,%%xmm0 \n"
  5121. "movdqu %%xmm0,(%2) \n"
  5122. "lea 0x10(%2),%2 \n"
  5123. "sub $0x4,%3 \n"
  5124. "jg 1b \n"
  5125. : "+r"(src_argb0), // %0
  5126. "+r"(src_argb1), // %1
  5127. "+r"(dst_argb), // %2
  5128. "+r"(width) // %3
  5129. :
  5130. : "memory", "cc", "xmm0", "xmm1");
  5131. }
  5132. #endif // HAS_ARGBSUBTRACTROW_SSE2
  5133. #ifdef HAS_ARGBSUBTRACTROW_AVX2
  5134. // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
  5135. void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
  5136. const uint8_t* src_argb1,
  5137. uint8_t* dst_argb,
  5138. int width) {
  5139. asm volatile(
5140. // 8 pixel loop.
  5141. LABELALIGN
  5142. "1: \n"
  5143. "vmovdqu (%0),%%ymm0 \n"
  5144. "lea 0x20(%0),%0 \n"
  5145. "vpsubusb (%1),%%ymm0,%%ymm0 \n"
  5146. "lea 0x20(%1),%1 \n"
  5147. "vmovdqu %%ymm0,(%2) \n"
  5148. "lea 0x20(%2),%2 \n"
  5149. "sub $0x8,%3 \n"
  5150. "jg 1b \n"
  5151. "vzeroupper \n"
  5152. : "+r"(src_argb0), // %0
  5153. "+r"(src_argb1), // %1
  5154. "+r"(dst_argb), // %2
  5155. "+r"(width) // %3
  5156. :
  5157. : "memory", "cc", "xmm0");
  5158. }
  5159. #endif // HAS_ARGBSUBTRACTROW_AVX2
  5160. #ifdef HAS_SOBELXROW_SSE2
  5161. // SobelX as a matrix is
  5162. // -1 0 1
  5163. // -2 0 2
  5164. // -1 0 1
  5165. void SobelXRow_SSE2(const uint8_t* src_y0,
  5166. const uint8_t* src_y1,
  5167. const uint8_t* src_y2,
  5168. uint8_t* dst_sobelx,
  5169. int width) {
  5170. asm volatile(
  5171. "sub %0,%1 \n"
  5172. "sub %0,%2 \n"
  5173. "sub %0,%3 \n"
  5174. "pxor %%xmm5,%%xmm5 \n"
  5175. // 8 pixel loop.
  5176. LABELALIGN
  5177. "1: \n"
  5178. "movq (%0),%%xmm0 \n"
  5179. "movq 0x2(%0),%%xmm1 \n"
  5180. "punpcklbw %%xmm5,%%xmm0 \n"
  5181. "punpcklbw %%xmm5,%%xmm1 \n"
  5182. "psubw %%xmm1,%%xmm0 \n"
  5183. "movq 0x00(%0,%1,1),%%xmm1 \n"
  5184. "movq 0x02(%0,%1,1),%%xmm2 \n"
  5185. "punpcklbw %%xmm5,%%xmm1 \n"
  5186. "punpcklbw %%xmm5,%%xmm2 \n"
  5187. "psubw %%xmm2,%%xmm1 \n"
  5188. "movq 0x00(%0,%2,1),%%xmm2 \n"
  5189. "movq 0x02(%0,%2,1),%%xmm3 \n"
  5190. "punpcklbw %%xmm5,%%xmm2 \n"
  5191. "punpcklbw %%xmm5,%%xmm3 \n"
  5192. "psubw %%xmm3,%%xmm2 \n"
  5193. "paddw %%xmm2,%%xmm0 \n"
  5194. "paddw %%xmm1,%%xmm0 \n"
  5195. "paddw %%xmm1,%%xmm0 \n"
  5196. "pxor %%xmm1,%%xmm1 \n"
  5197. "psubw %%xmm0,%%xmm1 \n"
  5198. "pmaxsw %%xmm1,%%xmm0 \n"
  5199. "packuswb %%xmm0,%%xmm0 \n"
  5200. "movq %%xmm0,0x00(%0,%3,1) \n"
  5201. "lea 0x8(%0),%0 \n"
  5202. "sub $0x8,%4 \n"
  5203. "jg 1b \n"
  5204. : "+r"(src_y0), // %0
  5205. "+r"(src_y1), // %1
  5206. "+r"(src_y2), // %2
  5207. "+r"(dst_sobelx), // %3
  5208. "+r"(width) // %4
  5209. :
  5210. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  5211. }
  5212. #endif // HAS_SOBELXROW_SSE2
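// Illustrative scalar sketch (not part of libyuv; the name is hypothetical).
// Per output pixel the kernel above computes the absolute horizontal gradient
//   |(y0[i]-y0[i+2]) + 2*(y1[i]-y1[i+2]) + (y2[i]-y2[i+2])|
// clamped to 255, i.e. the 3x3 SobelX matrix from the comment applied to
// three input rows. Like the SIMD version, it reads 2 bytes past 'width', so
// callers are assumed to provide those extra columns.
static inline void SobelXRow_ScalarSketch(const uint8_t* src_y0,
                                          const uint8_t* src_y1,
                                          const uint8_t* src_y2,
                                          uint8_t* dst_sobelx, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const int a = src_y0[i] - src_y0[i + 2];
    const int b = src_y1[i] - src_y1[i + 2];
    const int c = src_y2[i] - src_y2[i + 2];
    int sobel = a + b * 2 + c;
    if (sobel < 0) sobel = -sobel;  // pmaxsw with the negation, as above
    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
  }
}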
  5213. #ifdef HAS_SOBELYROW_SSE2
  5214. // SobelY as a matrix is
  5215. // -1 -2 -1
  5216. // 0 0 0
  5217. // 1 2 1
  5218. void SobelYRow_SSE2(const uint8_t* src_y0,
  5219. const uint8_t* src_y1,
  5220. uint8_t* dst_sobely,
  5221. int width) {
  5222. asm volatile(
  5223. "sub %0,%1 \n"
  5224. "sub %0,%2 \n"
  5225. "pxor %%xmm5,%%xmm5 \n"
  5226. // 8 pixel loop.
  5227. LABELALIGN
  5228. "1: \n"
  5229. "movq (%0),%%xmm0 \n"
  5230. "movq 0x00(%0,%1,1),%%xmm1 \n"
  5231. "punpcklbw %%xmm5,%%xmm0 \n"
  5232. "punpcklbw %%xmm5,%%xmm1 \n"
  5233. "psubw %%xmm1,%%xmm0 \n"
  5234. "movq 0x1(%0),%%xmm1 \n"
  5235. "movq 0x01(%0,%1,1),%%xmm2 \n"
  5236. "punpcklbw %%xmm5,%%xmm1 \n"
  5237. "punpcklbw %%xmm5,%%xmm2 \n"
  5238. "psubw %%xmm2,%%xmm1 \n"
  5239. "movq 0x2(%0),%%xmm2 \n"
  5240. "movq 0x02(%0,%1,1),%%xmm3 \n"
  5241. "punpcklbw %%xmm5,%%xmm2 \n"
  5242. "punpcklbw %%xmm5,%%xmm3 \n"
  5243. "psubw %%xmm3,%%xmm2 \n"
  5244. "paddw %%xmm2,%%xmm0 \n"
  5245. "paddw %%xmm1,%%xmm0 \n"
  5246. "paddw %%xmm1,%%xmm0 \n"
  5247. "pxor %%xmm1,%%xmm1 \n"
  5248. "psubw %%xmm0,%%xmm1 \n"
  5249. "pmaxsw %%xmm1,%%xmm0 \n"
  5250. "packuswb %%xmm0,%%xmm0 \n"
  5251. "movq %%xmm0,0x00(%0,%2,1) \n"
  5252. "lea 0x8(%0),%0 \n"
  5253. "sub $0x8,%3 \n"
  5254. "jg 1b \n"
  5255. : "+r"(src_y0), // %0
  5256. "+r"(src_y1), // %1
  5257. "+r"(dst_sobely), // %2
  5258. "+r"(width) // %3
  5259. :
  5260. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  5261. }
  5262. #endif // HAS_SOBELYROW_SSE2
  5263. #ifdef HAS_SOBELROW_SSE2
  5264. // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
  5265. // A = 255
  5266. // R = Sobel
  5267. // G = Sobel
  5268. // B = Sobel
  5269. void SobelRow_SSE2(const uint8_t* src_sobelx,
  5270. const uint8_t* src_sobely,
  5271. uint8_t* dst_argb,
  5272. int width) {
  5273. asm volatile(
  5274. "sub %0,%1 \n"
  5275. "pcmpeqb %%xmm5,%%xmm5 \n"
  5276. "pslld $0x18,%%xmm5 \n"
5277. // 16 pixel loop.
  5278. LABELALIGN
  5279. "1: \n"
  5280. "movdqu (%0),%%xmm0 \n"
  5281. "movdqu 0x00(%0,%1,1),%%xmm1 \n"
  5282. "lea 0x10(%0),%0 \n"
  5283. "paddusb %%xmm1,%%xmm0 \n"
  5284. "movdqa %%xmm0,%%xmm2 \n"
  5285. "punpcklbw %%xmm0,%%xmm2 \n"
  5286. "punpckhbw %%xmm0,%%xmm0 \n"
  5287. "movdqa %%xmm2,%%xmm1 \n"
  5288. "punpcklwd %%xmm2,%%xmm1 \n"
  5289. "punpckhwd %%xmm2,%%xmm2 \n"
  5290. "por %%xmm5,%%xmm1 \n"
  5291. "por %%xmm5,%%xmm2 \n"
  5292. "movdqa %%xmm0,%%xmm3 \n"
  5293. "punpcklwd %%xmm0,%%xmm3 \n"
  5294. "punpckhwd %%xmm0,%%xmm0 \n"
  5295. "por %%xmm5,%%xmm3 \n"
  5296. "por %%xmm5,%%xmm0 \n"
  5297. "movdqu %%xmm1,(%2) \n"
  5298. "movdqu %%xmm2,0x10(%2) \n"
  5299. "movdqu %%xmm3,0x20(%2) \n"
  5300. "movdqu %%xmm0,0x30(%2) \n"
  5301. "lea 0x40(%2),%2 \n"
  5302. "sub $0x10,%3 \n"
  5303. "jg 1b \n"
  5304. : "+r"(src_sobelx), // %0
  5305. "+r"(src_sobely), // %1
  5306. "+r"(dst_argb), // %2
  5307. "+r"(width) // %3
  5308. :
  5309. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  5310. }
  5311. #endif // HAS_SOBELROW_SSE2
  5312. #ifdef HAS_SOBELTOPLANEROW_SSE2
  5313. // Adds Sobel X and Sobel Y and stores Sobel into a plane.
  5314. void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
  5315. const uint8_t* src_sobely,
  5316. uint8_t* dst_y,
  5317. int width) {
  5318. asm volatile(
  5319. "sub %0,%1 \n"
  5320. "pcmpeqb %%xmm5,%%xmm5 \n"
  5321. "pslld $0x18,%%xmm5 \n"
5322. // 16 pixel loop.
  5323. LABELALIGN
  5324. "1: \n"
  5325. "movdqu (%0),%%xmm0 \n"
  5326. "movdqu 0x00(%0,%1,1),%%xmm1 \n"
  5327. "lea 0x10(%0),%0 \n"
  5328. "paddusb %%xmm1,%%xmm0 \n"
  5329. "movdqu %%xmm0,(%2) \n"
  5330. "lea 0x10(%2),%2 \n"
  5331. "sub $0x10,%3 \n"
  5332. "jg 1b \n"
  5333. : "+r"(src_sobelx), // %0
  5334. "+r"(src_sobely), // %1
  5335. "+r"(dst_y), // %2
  5336. "+r"(width) // %3
  5337. :
  5338. : "memory", "cc", "xmm0", "xmm1");
  5339. }
  5340. #endif // HAS_SOBELTOPLANEROW_SSE2
  5341. #ifdef HAS_SOBELXYROW_SSE2
  5342. // Mixes Sobel X, Sobel Y and Sobel into ARGB.
  5343. // A = 255
  5344. // R = Sobel X
  5345. // G = Sobel
  5346. // B = Sobel Y
  5347. void SobelXYRow_SSE2(const uint8_t* src_sobelx,
  5348. const uint8_t* src_sobely,
  5349. uint8_t* dst_argb,
  5350. int width) {
  5351. asm volatile(
  5352. "sub %0,%1 \n"
  5353. "pcmpeqb %%xmm5,%%xmm5 \n"
5354. // 16 pixel loop.
  5355. LABELALIGN
  5356. "1: \n"
  5357. "movdqu (%0),%%xmm0 \n"
  5358. "movdqu 0x00(%0,%1,1),%%xmm1 \n"
  5359. "lea 0x10(%0),%0 \n"
  5360. "movdqa %%xmm0,%%xmm2 \n"
  5361. "paddusb %%xmm1,%%xmm2 \n"
  5362. "movdqa %%xmm0,%%xmm3 \n"
  5363. "punpcklbw %%xmm5,%%xmm3 \n"
  5364. "punpckhbw %%xmm5,%%xmm0 \n"
  5365. "movdqa %%xmm1,%%xmm4 \n"
  5366. "punpcklbw %%xmm2,%%xmm4 \n"
  5367. "punpckhbw %%xmm2,%%xmm1 \n"
  5368. "movdqa %%xmm4,%%xmm6 \n"
  5369. "punpcklwd %%xmm3,%%xmm6 \n"
  5370. "punpckhwd %%xmm3,%%xmm4 \n"
  5371. "movdqa %%xmm1,%%xmm7 \n"
  5372. "punpcklwd %%xmm0,%%xmm7 \n"
  5373. "punpckhwd %%xmm0,%%xmm1 \n"
  5374. "movdqu %%xmm6,(%2) \n"
  5375. "movdqu %%xmm4,0x10(%2) \n"
  5376. "movdqu %%xmm7,0x20(%2) \n"
  5377. "movdqu %%xmm1,0x30(%2) \n"
  5378. "lea 0x40(%2),%2 \n"
  5379. "sub $0x10,%3 \n"
  5380. "jg 1b \n"
  5381. : "+r"(src_sobelx), // %0
  5382. "+r"(src_sobely), // %1
  5383. "+r"(dst_argb), // %2
  5384. "+r"(width) // %3
  5385. :
  5386. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  5387. "xmm7");
  5388. }
  5389. #endif // HAS_SOBELXYROW_SSE2
  5390. #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
  5391. // Creates a table of cumulative sums where each value is a sum of all values
  5392. // above and to the left of the value, inclusive of the value.
  5393. void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
  5394. int32_t* cumsum,
  5395. const int32_t* previous_cumsum,
  5396. int width) {
  5397. asm volatile(
  5398. "pxor %%xmm0,%%xmm0 \n"
  5399. "pxor %%xmm1,%%xmm1 \n"
  5400. "sub $0x4,%3 \n"
  5401. "jl 49f \n"
  5402. "test $0xf,%1 \n"
  5403. "jne 49f \n"
  5404. // 4 pixel loop.
  5405. LABELALIGN
  5406. "40: \n"
  5407. "movdqu (%0),%%xmm2 \n"
  5408. "lea 0x10(%0),%0 \n"
  5409. "movdqa %%xmm2,%%xmm4 \n"
  5410. "punpcklbw %%xmm1,%%xmm2 \n"
  5411. "movdqa %%xmm2,%%xmm3 \n"
  5412. "punpcklwd %%xmm1,%%xmm2 \n"
  5413. "punpckhwd %%xmm1,%%xmm3 \n"
  5414. "punpckhbw %%xmm1,%%xmm4 \n"
  5415. "movdqa %%xmm4,%%xmm5 \n"
  5416. "punpcklwd %%xmm1,%%xmm4 \n"
  5417. "punpckhwd %%xmm1,%%xmm5 \n"
  5418. "paddd %%xmm2,%%xmm0 \n"
  5419. "movdqu (%2),%%xmm2 \n"
  5420. "paddd %%xmm0,%%xmm2 \n"
  5421. "paddd %%xmm3,%%xmm0 \n"
  5422. "movdqu 0x10(%2),%%xmm3 \n"
  5423. "paddd %%xmm0,%%xmm3 \n"
  5424. "paddd %%xmm4,%%xmm0 \n"
  5425. "movdqu 0x20(%2),%%xmm4 \n"
  5426. "paddd %%xmm0,%%xmm4 \n"
  5427. "paddd %%xmm5,%%xmm0 \n"
  5428. "movdqu 0x30(%2),%%xmm5 \n"
  5429. "lea 0x40(%2),%2 \n"
  5430. "paddd %%xmm0,%%xmm5 \n"
  5431. "movdqu %%xmm2,(%1) \n"
  5432. "movdqu %%xmm3,0x10(%1) \n"
  5433. "movdqu %%xmm4,0x20(%1) \n"
  5434. "movdqu %%xmm5,0x30(%1) \n"
  5435. "lea 0x40(%1),%1 \n"
  5436. "sub $0x4,%3 \n"
  5437. "jge 40b \n"
  5438. "49: \n"
  5439. "add $0x3,%3 \n"
  5440. "jl 19f \n"
  5441. // 1 pixel loop.
  5442. LABELALIGN
  5443. "10: \n"
  5444. "movd (%0),%%xmm2 \n"
  5445. "lea 0x4(%0),%0 \n"
  5446. "punpcklbw %%xmm1,%%xmm2 \n"
  5447. "punpcklwd %%xmm1,%%xmm2 \n"
  5448. "paddd %%xmm2,%%xmm0 \n"
  5449. "movdqu (%2),%%xmm2 \n"
  5450. "lea 0x10(%2),%2 \n"
  5451. "paddd %%xmm0,%%xmm2 \n"
  5452. "movdqu %%xmm2,(%1) \n"
  5453. "lea 0x10(%1),%1 \n"
  5454. "sub $0x1,%3 \n"
  5455. "jge 10b \n"
  5456. "19: \n"
  5457. : "+r"(row), // %0
  5458. "+r"(cumsum), // %1
  5459. "+r"(previous_cumsum), // %2
  5460. "+r"(width) // %3
  5461. :
  5462. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  5463. }
  5464. #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
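// Illustrative scalar sketch (not part of libyuv; the name is hypothetical).
// For each 4-byte pixel, keep a running per-channel sum of this row and add
// the cumulative sum of the row above (previous_cumsum), producing one row of
// the integral image described in the comment above.
static inline void ComputeCumulativeSumRow_ScalarSketch(
    const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum,
    int width) {
  int32_t row_sum[4] = {0, 0, 0, 0};  // running B,G,R,A sums for this row
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      row_sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = row_sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}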
  5465. #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
  5466. void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
  5467. const int32_t* botleft,
  5468. int width,
  5469. int area,
  5470. uint8_t* dst,
  5471. int count) {
  5472. asm volatile(
  5473. "movd %5,%%xmm5 \n"
  5474. "cvtdq2ps %%xmm5,%%xmm5 \n"
  5475. "rcpss %%xmm5,%%xmm4 \n"
  5476. "pshufd $0x0,%%xmm4,%%xmm4 \n"
  5477. "sub $0x4,%3 \n"
  5478. "jl 49f \n"
  5479. "cmpl $0x80,%5 \n"
  5480. "ja 40f \n"
  5481. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  5482. "pcmpeqb %%xmm6,%%xmm6 \n"
  5483. "psrld $0x10,%%xmm6 \n"
  5484. "cvtdq2ps %%xmm6,%%xmm6 \n"
  5485. "addps %%xmm6,%%xmm5 \n"
  5486. "mulps %%xmm4,%%xmm5 \n"
  5487. "cvtps2dq %%xmm5,%%xmm5 \n"
  5488. "packssdw %%xmm5,%%xmm5 \n"
  5489. // 4 pixel small loop.
  5490. LABELALIGN
  5491. "4: \n"
  5492. "movdqu (%0),%%xmm0 \n"
  5493. "movdqu 0x10(%0),%%xmm1 \n"
  5494. "movdqu 0x20(%0),%%xmm2 \n"
  5495. "movdqu 0x30(%0),%%xmm3 \n"
  5496. "psubd 0x00(%0,%4,4),%%xmm0 \n"
  5497. "psubd 0x10(%0,%4,4),%%xmm1 \n"
  5498. "psubd 0x20(%0,%4,4),%%xmm2 \n"
  5499. "psubd 0x30(%0,%4,4),%%xmm3 \n"
  5500. "lea 0x40(%0),%0 \n"
  5501. "psubd (%1),%%xmm0 \n"
  5502. "psubd 0x10(%1),%%xmm1 \n"
  5503. "psubd 0x20(%1),%%xmm2 \n"
  5504. "psubd 0x30(%1),%%xmm3 \n"
  5505. "paddd 0x00(%1,%4,4),%%xmm0 \n"
  5506. "paddd 0x10(%1,%4,4),%%xmm1 \n"
  5507. "paddd 0x20(%1,%4,4),%%xmm2 \n"
  5508. "paddd 0x30(%1,%4,4),%%xmm3 \n"
  5509. "lea 0x40(%1),%1 \n"
  5510. "packssdw %%xmm1,%%xmm0 \n"
  5511. "packssdw %%xmm3,%%xmm2 \n"
  5512. "pmulhuw %%xmm5,%%xmm0 \n"
  5513. "pmulhuw %%xmm5,%%xmm2 \n"
  5514. "packuswb %%xmm2,%%xmm0 \n"
  5515. "movdqu %%xmm0,(%2) \n"
  5516. "lea 0x10(%2),%2 \n"
  5517. "sub $0x4,%3 \n"
  5518. "jge 4b \n"
  5519. "jmp 49f \n"
  5520. // 4 pixel loop
  5521. LABELALIGN
  5522. "40: \n"
  5523. "movdqu (%0),%%xmm0 \n"
  5524. "movdqu 0x10(%0),%%xmm1 \n"
  5525. "movdqu 0x20(%0),%%xmm2 \n"
  5526. "movdqu 0x30(%0),%%xmm3 \n"
  5527. "psubd 0x00(%0,%4,4),%%xmm0 \n"
  5528. "psubd 0x10(%0,%4,4),%%xmm1 \n"
  5529. "psubd 0x20(%0,%4,4),%%xmm2 \n"
  5530. "psubd 0x30(%0,%4,4),%%xmm3 \n"
  5531. "lea 0x40(%0),%0 \n"
  5532. "psubd (%1),%%xmm0 \n"
  5533. "psubd 0x10(%1),%%xmm1 \n"
  5534. "psubd 0x20(%1),%%xmm2 \n"
  5535. "psubd 0x30(%1),%%xmm3 \n"
  5536. "paddd 0x00(%1,%4,4),%%xmm0 \n"
  5537. "paddd 0x10(%1,%4,4),%%xmm1 \n"
  5538. "paddd 0x20(%1,%4,4),%%xmm2 \n"
  5539. "paddd 0x30(%1,%4,4),%%xmm3 \n"
  5540. "lea 0x40(%1),%1 \n"
  5541. "cvtdq2ps %%xmm0,%%xmm0 \n"
  5542. "cvtdq2ps %%xmm1,%%xmm1 \n"
  5543. "mulps %%xmm4,%%xmm0 \n"
  5544. "mulps %%xmm4,%%xmm1 \n"
  5545. "cvtdq2ps %%xmm2,%%xmm2 \n"
  5546. "cvtdq2ps %%xmm3,%%xmm3 \n"
  5547. "mulps %%xmm4,%%xmm2 \n"
  5548. "mulps %%xmm4,%%xmm3 \n"
  5549. "cvtps2dq %%xmm0,%%xmm0 \n"
  5550. "cvtps2dq %%xmm1,%%xmm1 \n"
  5551. "cvtps2dq %%xmm2,%%xmm2 \n"
  5552. "cvtps2dq %%xmm3,%%xmm3 \n"
  5553. "packssdw %%xmm1,%%xmm0 \n"
  5554. "packssdw %%xmm3,%%xmm2 \n"
  5555. "packuswb %%xmm2,%%xmm0 \n"
  5556. "movdqu %%xmm0,(%2) \n"
  5557. "lea 0x10(%2),%2 \n"
  5558. "sub $0x4,%3 \n"
  5559. "jge 40b \n"
  5560. "49: \n"
  5561. "add $0x3,%3 \n"
  5562. "jl 19f \n"
  5563. // 1 pixel loop
  5564. LABELALIGN
  5565. "10: \n"
  5566. "movdqu (%0),%%xmm0 \n"
  5567. "psubd 0x00(%0,%4,4),%%xmm0 \n"
  5568. "lea 0x10(%0),%0 \n"
  5569. "psubd (%1),%%xmm0 \n"
  5570. "paddd 0x00(%1,%4,4),%%xmm0 \n"
  5571. "lea 0x10(%1),%1 \n"
  5572. "cvtdq2ps %%xmm0,%%xmm0 \n"
  5573. "mulps %%xmm4,%%xmm0 \n"
  5574. "cvtps2dq %%xmm0,%%xmm0 \n"
  5575. "packssdw %%xmm0,%%xmm0 \n"
  5576. "packuswb %%xmm0,%%xmm0 \n"
  5577. "movd %%xmm0,(%2) \n"
  5578. "lea 0x4(%2),%2 \n"
  5579. "sub $0x1,%3 \n"
  5580. "jge 10b \n"
  5581. "19: \n"
  5582. : "+r"(topleft), // %0
  5583. "+r"(botleft), // %1
  5584. "+r"(dst), // %2
  5585. "+rm"(count) // %3
  5586. : "r"((intptr_t)(width)), // %4
  5587. "rm"(area) // %5
  5588. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  5589. }
  5590. #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
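// Illustrative scalar sketch (not part of libyuv; the name is hypothetical).
// Each output pixel is a box average read from the integral image:
// (TL - TR - BL + BR) / area, where the right-edge columns sit 'width' int32
// entries past the left-edge pointers (the same offset the %4 operand above
// supplies). The asm uses rcpss/pmulhuw reciprocals, so its rounding can
// differ slightly from the plain float multiply here.
static inline void CumulativeSumToAverageRow_ScalarSketch(
    const int32_t* topleft, const int32_t* botleft, int width, int area,
    uint8_t* dst, int count) {
  const float ooa = 1.0f / (float)area;  // reciprocal of the box area
  int i, c;
  for (i = 0; i < count; ++i) {
    for (c = 0; c < 4; ++c) {
      const int32_t sum = topleft[c] - topleft[width + c] - botleft[c] +
                          botleft[width + c];
      dst[c] = (uint8_t)((float)sum * ooa);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}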
  5591. #ifdef HAS_ARGBAFFINEROW_SSE2
  5592. // Copy ARGB pixels from source image with slope to a row of destination.
  5593. LIBYUV_API
  5594. void ARGBAffineRow_SSE2(const uint8_t* src_argb,
  5595. int src_argb_stride,
  5596. uint8_t* dst_argb,
  5597. const float* src_dudv,
  5598. int width) {
  5599. intptr_t src_argb_stride_temp = src_argb_stride;
  5600. intptr_t temp;
  5601. asm volatile(
  5602. "movq (%3),%%xmm2 \n"
  5603. "movq 0x08(%3),%%xmm7 \n"
  5604. "shl $0x10,%1 \n"
  5605. "add $0x4,%1 \n"
  5606. "movd %1,%%xmm5 \n"
  5607. "sub $0x4,%4 \n"
  5608. "jl 49f \n"
  5609. "pshufd $0x44,%%xmm7,%%xmm7 \n"
  5610. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  5611. "movdqa %%xmm2,%%xmm0 \n"
  5612. "addps %%xmm7,%%xmm0 \n"
  5613. "movlhps %%xmm0,%%xmm2 \n"
  5614. "movdqa %%xmm7,%%xmm4 \n"
  5615. "addps %%xmm4,%%xmm4 \n"
  5616. "movdqa %%xmm2,%%xmm3 \n"
  5617. "addps %%xmm4,%%xmm3 \n"
  5618. "addps %%xmm4,%%xmm4 \n"
  5619. // 4 pixel loop
  5620. LABELALIGN
  5621. "40: \n"
  5622. "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
  5623. "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
  5624. "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
  5625. "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
  5626. "movd %%xmm0,%k1 \n"
  5627. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  5628. "movd %%xmm0,%k5 \n"
  5629. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  5630. "movd 0x00(%0,%1,1),%%xmm1 \n"
  5631. "movd 0x00(%0,%5,1),%%xmm6 \n"
  5632. "punpckldq %%xmm6,%%xmm1 \n"
  5633. "addps %%xmm4,%%xmm2 \n"
  5634. "movq %%xmm1,(%2) \n"
  5635. "movd %%xmm0,%k1 \n"
  5636. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  5637. "movd %%xmm0,%k5 \n"
  5638. "movd 0x00(%0,%1,1),%%xmm0 \n"
  5639. "movd 0x00(%0,%5,1),%%xmm6 \n"
  5640. "punpckldq %%xmm6,%%xmm0 \n"
  5641. "addps %%xmm4,%%xmm3 \n"
  5642. "movq %%xmm0,0x08(%2) \n"
  5643. "lea 0x10(%2),%2 \n"
  5644. "sub $0x4,%4 \n"
  5645. "jge 40b \n"
  5646. "49: \n"
  5647. "add $0x3,%4 \n"
  5648. "jl 19f \n"
  5649. // 1 pixel loop
  5650. LABELALIGN
  5651. "10: \n"
  5652. "cvttps2dq %%xmm2,%%xmm0 \n"
  5653. "packssdw %%xmm0,%%xmm0 \n"
  5654. "pmaddwd %%xmm5,%%xmm0 \n"
  5655. "addps %%xmm7,%%xmm2 \n"
  5656. "movd %%xmm0,%k1 \n"
  5657. "movd 0x00(%0,%1,1),%%xmm0 \n"
  5658. "movd %%xmm0,(%2) \n"
  5659. "lea 0x04(%2),%2 \n"
  5660. "sub $0x1,%4 \n"
  5661. "jge 10b \n"
  5662. "19: \n"
  5663. : "+r"(src_argb), // %0
  5664. "+r"(src_argb_stride_temp), // %1
  5665. "+r"(dst_argb), // %2
  5666. "+r"(src_dudv), // %3
  5667. "+rm"(width), // %4
  5668. "=&r"(temp) // %5
  5669. :
  5670. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  5671. "xmm7");
  5672. }
  5673. #endif // HAS_ARGBAFFINEROW_SSE2
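// Illustrative scalar sketch (not part of libyuv; the name is hypothetical).
// src_dudv holds the start coordinate (u, v) and the per-pixel step (du, dv).
// Each destination pixel copies the 4-byte source pixel at (int)u, (int)v and
// then steps the coordinates; the pmaddwd above folds that into a single
// x*4 + y*stride byte offset.
static inline void ARGBAffineRow_ScalarSketch(const uint8_t* src_argb,
                                              int src_argb_stride,
                                              uint8_t* dst_argb,
                                              const float* src_dudv,
                                              int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  const float du = src_dudv[2];
  const float dv = src_dudv[3];
  int i, c;
  for (i = 0; i < width; ++i) {
    const int x = (int)u;  // truncate toward zero, as cvttps2dq does
    const int y = (int)v;
    const uint8_t* s = src_argb + y * src_argb_stride + x * 4;
    for (c = 0; c < 4; ++c) {
      dst_argb[c] = s[c];
    }
    dst_argb += 4;
    u += du;
    v += dv;
  }
}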
  5674. #ifdef HAS_INTERPOLATEROW_SSSE3
  5675. // Bilinear filter 16x2 -> 16x1
  5676. void InterpolateRow_SSSE3(uint8_t* dst_ptr,
  5677. const uint8_t* src_ptr,
  5678. ptrdiff_t src_stride,
  5679. int dst_width,
  5680. int source_y_fraction) {
  5681. asm volatile(
  5682. "sub %1,%0 \n"
  5683. "cmp $0x0,%3 \n"
  5684. "je 100f \n"
  5685. "cmp $0x80,%3 \n"
  5686. "je 50f \n"
  5687. "movd %3,%%xmm0 \n"
  5688. "neg %3 \n"
  5689. "add $0x100,%3 \n"
  5690. "movd %3,%%xmm5 \n"
  5691. "punpcklbw %%xmm0,%%xmm5 \n"
  5692. "punpcklwd %%xmm5,%%xmm5 \n"
  5693. "pshufd $0x0,%%xmm5,%%xmm5 \n"
  5694. "mov $0x80808080,%%eax \n"
  5695. "movd %%eax,%%xmm4 \n"
  5696. "pshufd $0x0,%%xmm4,%%xmm4 \n"
  5697. // General purpose row blend.
  5698. LABELALIGN
  5699. "1: \n"
  5700. "movdqu (%1),%%xmm0 \n"
  5701. "movdqu 0x00(%1,%4,1),%%xmm2 \n"
  5702. "movdqa %%xmm0,%%xmm1 \n"
  5703. "punpcklbw %%xmm2,%%xmm0 \n"
  5704. "punpckhbw %%xmm2,%%xmm1 \n"
  5705. "psubb %%xmm4,%%xmm0 \n"
  5706. "psubb %%xmm4,%%xmm1 \n"
  5707. "movdqa %%xmm5,%%xmm2 \n"
  5708. "movdqa %%xmm5,%%xmm3 \n"
  5709. "pmaddubsw %%xmm0,%%xmm2 \n"
  5710. "pmaddubsw %%xmm1,%%xmm3 \n"
  5711. "paddw %%xmm4,%%xmm2 \n"
  5712. "paddw %%xmm4,%%xmm3 \n"
  5713. "psrlw $0x8,%%xmm2 \n"
  5714. "psrlw $0x8,%%xmm3 \n"
  5715. "packuswb %%xmm3,%%xmm2 \n"
  5716. "movdqu %%xmm2,0x00(%1,%0,1) \n"
  5717. "lea 0x10(%1),%1 \n"
  5718. "sub $0x10,%2 \n"
  5719. "jg 1b \n"
  5720. "jmp 99f \n"
  5721. // Blend 50 / 50.
  5722. LABELALIGN
  5723. "50: \n"
  5724. "movdqu (%1),%%xmm0 \n"
  5725. "movdqu 0x00(%1,%4,1),%%xmm1 \n"
  5726. "pavgb %%xmm1,%%xmm0 \n"
  5727. "movdqu %%xmm0,0x00(%1,%0,1) \n"
  5728. "lea 0x10(%1),%1 \n"
  5729. "sub $0x10,%2 \n"
  5730. "jg 50b \n"
  5731. "jmp 99f \n"
  5732. // Blend 100 / 0 - Copy row unchanged.
  5733. LABELALIGN
  5734. "100: \n"
  5735. "movdqu (%1),%%xmm0 \n"
  5736. "movdqu %%xmm0,0x00(%1,%0,1) \n"
  5737. "lea 0x10(%1),%1 \n"
  5738. "sub $0x10,%2 \n"
  5739. "jg 100b \n"
  5740. "99: \n"
  5741. : "+r"(dst_ptr), // %0
  5742. "+r"(src_ptr), // %1
  5743. "+rm"(dst_width), // %2
  5744. "+r"(source_y_fraction) // %3
  5745. : "r"((intptr_t)(src_stride)) // %4
  5746. : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  5747. }
  5748. #endif // HAS_INTERPOLATEROW_SSSE3
  5749. #ifdef HAS_INTERPOLATEROW_AVX2
  5750. // Bilinear filter 32x2 -> 32x1
  5751. void InterpolateRow_AVX2(uint8_t* dst_ptr,
  5752. const uint8_t* src_ptr,
  5753. ptrdiff_t src_stride,
  5754. int dst_width,
  5755. int source_y_fraction) {
  5756. asm volatile(
  5757. "cmp $0x0,%3 \n"
  5758. "je 100f \n"
  5759. "sub %1,%0 \n"
  5760. "cmp $0x80,%3 \n"
  5761. "je 50f \n"
  5762. "vmovd %3,%%xmm0 \n"
  5763. "neg %3 \n"
  5764. "add $0x100,%3 \n"
  5765. "vmovd %3,%%xmm5 \n"
  5766. "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
  5767. "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
  5768. "vbroadcastss %%xmm5,%%ymm5 \n"
  5769. "mov $0x80808080,%%eax \n"
  5770. "vmovd %%eax,%%xmm4 \n"
  5771. "vbroadcastss %%xmm4,%%ymm4 \n"
  5772. // General purpose row blend.
  5773. LABELALIGN
  5774. "1: \n"
  5775. "vmovdqu (%1),%%ymm0 \n"
  5776. "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
  5777. "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
  5778. "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
  5779. "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
  5780. "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
  5781. "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
  5782. "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
  5783. "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
  5784. "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
  5785. "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
  5786. "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
  5787. "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
  5788. "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
  5789. "lea 0x20(%1),%1 \n"
  5790. "sub $0x20,%2 \n"
  5791. "jg 1b \n"
  5792. "jmp 99f \n"
  5793. // Blend 50 / 50.
  5794. LABELALIGN
  5795. "50: \n"
  5796. "vmovdqu (%1),%%ymm0 \n"
  5797. "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
  5798. "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
  5799. "lea 0x20(%1),%1 \n"
  5800. "sub $0x20,%2 \n"
  5801. "jg 50b \n"
  5802. "jmp 99f \n"
  5803. // Blend 100 / 0 - Copy row unchanged.
  5804. LABELALIGN
  5805. "100: \n"
  5806. "rep movsb \n"
  5807. "jmp 999f \n"
  5808. "99: \n"
  5809. "vzeroupper \n"
  5810. "999: \n"
  5811. : "+D"(dst_ptr), // %0
  5812. "+S"(src_ptr), // %1
  5813. "+cm"(dst_width), // %2
  5814. "+r"(source_y_fraction) // %3
  5815. : "r"((intptr_t)(src_stride)) // %4
  5816. : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
  5817. }
  5818. #endif // HAS_INTERPOLATEROW_AVX2
  5819. #ifdef HAS_ARGBSHUFFLEROW_SSSE3
  5820. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  5821. void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
  5822. uint8_t* dst_argb,
  5823. const uint8_t* shuffler,
  5824. int width) {
  5825. asm volatile(
  5826. "movdqu (%3),%%xmm5 \n"
  5827. LABELALIGN
  5828. "1: \n"
  5829. "movdqu (%0),%%xmm0 \n"
  5830. "movdqu 0x10(%0),%%xmm1 \n"
  5831. "lea 0x20(%0),%0 \n"
  5832. "pshufb %%xmm5,%%xmm0 \n"
  5833. "pshufb %%xmm5,%%xmm1 \n"
  5834. "movdqu %%xmm0,(%1) \n"
  5835. "movdqu %%xmm1,0x10(%1) \n"
  5836. "lea 0x20(%1),%1 \n"
  5837. "sub $0x8,%2 \n"
  5838. "jg 1b \n"
  5839. : "+r"(src_argb), // %0
  5840. "+r"(dst_argb), // %1
  5841. "+r"(width) // %2
  5842. : "r"(shuffler) // %3
  5843. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  5844. }
  5845. #endif // HAS_ARGBSHUFFLEROW_SSSE3
  5846. #ifdef HAS_ARGBSHUFFLEROW_AVX2
  5847. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
  5848. void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
  5849. uint8_t* dst_argb,
  5850. const uint8_t* shuffler,
  5851. int width) {
  5852. asm volatile(
  5853. "vbroadcastf128 (%3),%%ymm5 \n"
  5854. LABELALIGN
  5855. "1: \n"
  5856. "vmovdqu (%0),%%ymm0 \n"
  5857. "vmovdqu 0x20(%0),%%ymm1 \n"
  5858. "lea 0x40(%0),%0 \n"
  5859. "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
  5860. "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
  5861. "vmovdqu %%ymm0,(%1) \n"
  5862. "vmovdqu %%ymm1,0x20(%1) \n"
  5863. "lea 0x40(%1),%1 \n"
  5864. "sub $0x10,%2 \n"
  5865. "jg 1b \n"
  5866. "vzeroupper \n"
  5867. : "+r"(src_argb), // %0
  5868. "+r"(dst_argb), // %1
  5869. "+r"(width) // %2
  5870. : "r"(shuffler) // %3
  5871. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  5872. }
  5873. #endif // HAS_ARGBSHUFFLEROW_AVX2
  5874. #ifdef HAS_I422TOYUY2ROW_SSE2
  5875. void I422ToYUY2Row_SSE2(const uint8_t* src_y,
  5876. const uint8_t* src_u,
  5877. const uint8_t* src_v,
  5878. uint8_t* dst_yuy2,
  5879. int width) {
  5880. asm volatile(
  5881. "sub %1,%2 \n"
  5882. LABELALIGN
  5883. "1: \n"
  5884. "movq (%1),%%xmm2 \n"
  5885. "movq 0x00(%1,%2,1),%%xmm1 \n"
  5886. "add $0x8,%1 \n"
  5887. "punpcklbw %%xmm1,%%xmm2 \n"
  5888. "movdqu (%0),%%xmm0 \n"
  5889. "add $0x10,%0 \n"
  5890. "movdqa %%xmm0,%%xmm1 \n"
  5891. "punpcklbw %%xmm2,%%xmm0 \n"
  5892. "punpckhbw %%xmm2,%%xmm1 \n"
  5893. "movdqu %%xmm0,(%3) \n"
  5894. "movdqu %%xmm1,0x10(%3) \n"
  5895. "lea 0x20(%3),%3 \n"
  5896. "sub $0x10,%4 \n"
  5897. "jg 1b \n"
  5898. : "+r"(src_y), // %0
  5899. "+r"(src_u), // %1
  5900. "+r"(src_v), // %2
  5901. "+r"(dst_yuy2), // %3
  5902. "+rm"(width) // %4
  5903. :
  5904. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  5905. }
  5906. #endif // HAS_I422TOYUY2ROW_SSE2
  5907. #ifdef HAS_I422TOUYVYROW_SSE2
  5908. void I422ToUYVYRow_SSE2(const uint8_t* src_y,
  5909. const uint8_t* src_u,
  5910. const uint8_t* src_v,
  5911. uint8_t* dst_uyvy,
  5912. int width) {
  5913. asm volatile(
  5914. "sub %1,%2 \n"
  5915. LABELALIGN
  5916. "1: \n"
  5917. "movq (%1),%%xmm2 \n"
  5918. "movq 0x00(%1,%2,1),%%xmm1 \n"
  5919. "add $0x8,%1 \n"
  5920. "punpcklbw %%xmm1,%%xmm2 \n"
  5921. "movdqu (%0),%%xmm0 \n"
  5922. "movdqa %%xmm2,%%xmm1 \n"
  5923. "add $0x10,%0 \n"
  5924. "punpcklbw %%xmm0,%%xmm1 \n"
  5925. "punpckhbw %%xmm0,%%xmm2 \n"
  5926. "movdqu %%xmm1,(%3) \n"
  5927. "movdqu %%xmm2,0x10(%3) \n"
  5928. "lea 0x20(%3),%3 \n"
  5929. "sub $0x10,%4 \n"
  5930. "jg 1b \n"
  5931. : "+r"(src_y), // %0
  5932. "+r"(src_u), // %1
  5933. "+r"(src_v), // %2
  5934. "+r"(dst_uyvy), // %3
  5935. "+rm"(width) // %4
  5936. :
  5937. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  5938. }
  5939. #endif // HAS_I422TOUYVYROW_SSE2
  5940. #ifdef HAS_I422TOYUY2ROW_AVX2
  5941. void I422ToYUY2Row_AVX2(const uint8_t* src_y,
  5942. const uint8_t* src_u,
  5943. const uint8_t* src_v,
  5944. uint8_t* dst_yuy2,
  5945. int width) {
  5946. asm volatile(
  5947. "sub %1,%2 \n"
  5948. LABELALIGN
  5949. "1: \n"
  5950. "vpmovzxbw (%1),%%ymm1 \n"
  5951. "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
  5952. "add $0x10,%1 \n"
  5953. "vpsllw $0x8,%%ymm2,%%ymm2 \n"
  5954. "vpor %%ymm1,%%ymm2,%%ymm2 \n"
  5955. "vmovdqu (%0),%%ymm0 \n"
  5956. "add $0x20,%0 \n"
  5957. "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
  5958. "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
  5959. "vextractf128 $0x0,%%ymm1,(%3) \n"
  5960. "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
  5961. "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
  5962. "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
  5963. "lea 0x40(%3),%3 \n"
  5964. "sub $0x20,%4 \n"
  5965. "jg 1b \n"
  5966. "vzeroupper \n"
  5967. : "+r"(src_y), // %0
  5968. "+r"(src_u), // %1
  5969. "+r"(src_v), // %2
  5970. "+r"(dst_yuy2), // %3
  5971. "+rm"(width) // %4
  5972. :
  5973. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  5974. }
  5975. #endif // HAS_I422TOYUY2ROW_AVX2
  5976. #ifdef HAS_I422TOUYVYROW_AVX2
  5977. void I422ToUYVYRow_AVX2(const uint8_t* src_y,
  5978. const uint8_t* src_u,
  5979. const uint8_t* src_v,
  5980. uint8_t* dst_uyvy,
  5981. int width) {
  5982. asm volatile(
  5983. "sub %1,%2 \n"
  5984. LABELALIGN
  5985. "1: \n"
  5986. "vpmovzxbw (%1),%%ymm1 \n"
  5987. "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
  5988. "add $0x10,%1 \n"
  5989. "vpsllw $0x8,%%ymm2,%%ymm2 \n"
  5990. "vpor %%ymm1,%%ymm2,%%ymm2 \n"
  5991. "vmovdqu (%0),%%ymm0 \n"
  5992. "add $0x20,%0 \n"
  5993. "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
  5994. "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
  5995. "vextractf128 $0x0,%%ymm1,(%3) \n"
  5996. "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
  5997. "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
  5998. "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
  5999. "lea 0x40(%3),%3 \n"
  6000. "sub $0x20,%4 \n"
  6001. "jg 1b \n"
  6002. "vzeroupper \n"
  6003. : "+r"(src_y), // %0
  6004. "+r"(src_u), // %1
  6005. "+r"(src_v), // %2
  6006. "+r"(dst_uyvy), // %3
  6007. "+rm"(width) // %4
  6008. :
  6009. : "memory", "cc", "xmm0", "xmm1", "xmm2");
  6010. }
  6011. #endif // HAS_I422TOUYVYROW_AVX2
  6012. #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
  6013. void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
  6014. uint8_t* dst_argb,
  6015. const float* poly,
  6016. int width) {
  6017. asm volatile(
  6018. "pxor %%xmm3,%%xmm3 \n"
  6019. // 2 pixel loop.
  6020. LABELALIGN
  6021. "1: \n"
  6022. "movq (%0),%%xmm0 \n"
  6023. "lea 0x8(%0),%0 \n"
  6024. "punpcklbw %%xmm3,%%xmm0 \n"
  6025. "movdqa %%xmm0,%%xmm4 \n"
  6026. "punpcklwd %%xmm3,%%xmm0 \n"
  6027. "punpckhwd %%xmm3,%%xmm4 \n"
  6028. "cvtdq2ps %%xmm0,%%xmm0 \n"
  6029. "cvtdq2ps %%xmm4,%%xmm4 \n"
  6030. "movdqa %%xmm0,%%xmm1 \n"
  6031. "movdqa %%xmm4,%%xmm5 \n"
  6032. "mulps 0x10(%3),%%xmm0 \n"
  6033. "mulps 0x10(%3),%%xmm4 \n"
  6034. "addps (%3),%%xmm0 \n"
  6035. "addps (%3),%%xmm4 \n"
  6036. "movdqa %%xmm1,%%xmm2 \n"
  6037. "movdqa %%xmm5,%%xmm6 \n"
  6038. "mulps %%xmm1,%%xmm2 \n"
  6039. "mulps %%xmm5,%%xmm6 \n"
  6040. "mulps %%xmm2,%%xmm1 \n"
  6041. "mulps %%xmm6,%%xmm5 \n"
  6042. "mulps 0x20(%3),%%xmm2 \n"
  6043. "mulps 0x20(%3),%%xmm6 \n"
  6044. "mulps 0x30(%3),%%xmm1 \n"
  6045. "mulps 0x30(%3),%%xmm5 \n"
  6046. "addps %%xmm2,%%xmm0 \n"
  6047. "addps %%xmm6,%%xmm4 \n"
  6048. "addps %%xmm1,%%xmm0 \n"
  6049. "addps %%xmm5,%%xmm4 \n"
  6050. "cvttps2dq %%xmm0,%%xmm0 \n"
  6051. "cvttps2dq %%xmm4,%%xmm4 \n"
  6052. "packuswb %%xmm4,%%xmm0 \n"
  6053. "packuswb %%xmm0,%%xmm0 \n"
  6054. "movq %%xmm0,(%1) \n"
  6055. "lea 0x8(%1),%1 \n"
  6056. "sub $0x2,%2 \n"
  6057. "jg 1b \n"
  6058. : "+r"(src_argb), // %0
  6059. "+r"(dst_argb), // %1
  6060. "+r"(width) // %2
  6061. : "r"(poly) // %3
  6062. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
  6063. }
  6064. #endif // HAS_ARGBPOLYNOMIALROW_SSE2
  6065. #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
  6066. void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
  6067. uint8_t* dst_argb,
  6068. const float* poly,
  6069. int width) {
  6070. asm volatile(
  6071. "vbroadcastf128 (%3),%%ymm4 \n"
  6072. "vbroadcastf128 0x10(%3),%%ymm5 \n"
  6073. "vbroadcastf128 0x20(%3),%%ymm6 \n"
  6074. "vbroadcastf128 0x30(%3),%%ymm7 \n"
  6075. // 2 pixel loop.
  6076. LABELALIGN
  6077. "1: \n"
  6078. "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels
  6079. "lea 0x8(%0),%0 \n"
  6080. "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
  6081. "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
  6082. "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
  6083. "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
  6084. "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
  6085. "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X *
  6086. // X
  6087. "vcvttps2dq %%ymm0,%%ymm0 \n"
  6088. "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
  6089. "vpermq $0xd8,%%ymm0,%%ymm0 \n"
  6090. "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
  6091. "vmovq %%xmm0,(%1) \n"
  6092. "lea 0x8(%1),%1 \n"
  6093. "sub $0x2,%2 \n"
  6094. "jg 1b \n"
  6095. "vzeroupper \n"
  6096. : "+r"(src_argb), // %0
  6097. "+r"(dst_argb), // %1
  6098. "+r"(width) // %2
  6099. : "r"(poly) // %3
  6100. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  6101. "xmm7");
  6102. }
  6103. #endif // HAS_ARGBPOLYNOMIALROW_AVX2
  6104. #ifdef HAS_HALFFLOATROW_SSE2
  6105. static float kScaleBias = 1.9259299444e-34f;
  6106. void HalfFloatRow_SSE2(const uint16_t* src,
  6107. uint16_t* dst,
  6108. float scale,
  6109. int width) {
  6110. scale *= kScaleBias;
  6111. asm volatile(
  6112. "movd %3,%%xmm4 \n"
  6113. "pshufd $0x0,%%xmm4,%%xmm4 \n"
  6114. "pxor %%xmm5,%%xmm5 \n"
  6115. "sub %0,%1 \n"
  6116. // 16 pixel loop.
  6117. LABELALIGN
  6118. "1: \n"
  6119. "movdqu (%0),%%xmm2 \n" // 8 shorts
  6120. "add $0x10,%0 \n"
  6121. "movdqa %%xmm2,%%xmm3 \n"
  6122. "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1
  6123. "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
  6124. "punpckhwd %%xmm5,%%xmm3 \n"
  6125. "cvtdq2ps %%xmm3,%%xmm3 \n"
  6126. "mulps %%xmm4,%%xmm2 \n"
  6127. "mulps %%xmm4,%%xmm3 \n"
  6128. "psrld $0xd,%%xmm2 \n"
  6129. "psrld $0xd,%%xmm3 \n"
  6130. "packssdw %%xmm3,%%xmm2 \n"
  6131. "movdqu %%xmm2,-0x10(%0,%1,1) \n"
  6132. "sub $0x8,%2 \n"
  6133. "jg 1b \n"
  6134. : "+r"(src), // %0
  6135. "+r"(dst), // %1
  6136. "+r"(width) // %2
  6137. : "m"(scale) // %3
  6138. : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
  6139. }
  6140. #endif // HAS_HALFFLOATROW_SSE2
  6141. #ifdef HAS_HALFFLOATROW_AVX2
  6142. void HalfFloatRow_AVX2(const uint16_t* src,
  6143. uint16_t* dst,
  6144. float scale,
  6145. int width) {
  6146. scale *= kScaleBias;
  6147. asm volatile(
  6148. "vbroadcastss %3, %%ymm4 \n"
  6149. "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
  6150. "sub %0,%1 \n"
  6151. // 16 pixel loop.
  6152. LABELALIGN
  6153. "1: \n"
  6154. "vmovdqu (%0),%%ymm2 \n" // 16 shorts
  6155. "add $0x20,%0 \n"
  6156. "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
  6157. "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
  6158. "vcvtdq2ps %%ymm3,%%ymm3 \n"
  6159. "vcvtdq2ps %%ymm2,%%ymm2 \n"
  6160. "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
  6161. "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
  6162. "vpsrld $0xd,%%ymm3,%%ymm3 \n"
  6163. "vpsrld $0xd,%%ymm2,%%ymm2 \n"
  6164. "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
  6165. "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
  6166. "sub $0x10,%2 \n"
  6167. "jg 1b \n"
  6168. "vzeroupper \n"
  6169. : "+r"(src), // %0
  6170. "+r"(dst), // %1
  6171. "+r"(width) // %2
  6172. #if defined(__x86_64__)
  6173. : "x"(scale) // %3
  6174. #else
  6175. : "m"(scale) // %3
  6176. #endif
  6177. : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
  6178. }
  6179. #endif // HAS_HALFFLOATROW_AVX2
  6180. #ifdef HAS_HALFFLOATROW_F16C
  6181. void HalfFloatRow_F16C(const uint16_t* src,
  6182. uint16_t* dst,
  6183. float scale,
  6184. int width) {
  6185. asm volatile(
  6186. "vbroadcastss %3, %%ymm4 \n"
  6187. "sub %0,%1 \n"
  6188. // 16 pixel loop.
  6189. LABELALIGN
  6190. "1: \n"
  6191. "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
  6192. "vpmovzxwd 0x10(%0),%%ymm3 \n"
  6193. "vcvtdq2ps %%ymm2,%%ymm2 \n"
  6194. "vcvtdq2ps %%ymm3,%%ymm3 \n"
  6195. "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
  6196. "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
  6197. "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
  6198. "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
  6199. "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
  6200. "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
  6201. "add $0x20,%0 \n"
  6202. "sub $0x10,%2 \n"
  6203. "jg 1b \n"
  6204. "vzeroupper \n"
  6205. : "+r"(src), // %0
  6206. "+r"(dst), // %1
  6207. "+r"(width) // %2
  6208. #if defined(__x86_64__)
  6209. : "x"(scale) // %3
  6210. #else
  6211. : "m"(scale) // %3
  6212. #endif
  6213. : "memory", "cc", "xmm2", "xmm3", "xmm4");
  6214. }
  6215. #endif // HAS_HALFFLOATROW_F16C
  6216. #ifdef HAS_HALFFLOATROW_F16C
  6217. void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
  6218. asm volatile(
  6219. "sub %0,%1 \n"
  6220. // 16 pixel loop.
  6221. LABELALIGN
  6222. "1: \n"
  6223. "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
  6224. "vpmovzxwd 0x10(%0),%%ymm3 \n"
  6225. "vcvtdq2ps %%ymm2,%%ymm2 \n"
  6226. "vcvtdq2ps %%ymm3,%%ymm3 \n"
  6227. "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
  6228. "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
  6229. "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
  6230. "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
  6231. "add $0x20,%0 \n"
  6232. "sub $0x10,%2 \n"
  6233. "jg 1b \n"
  6234. "vzeroupper \n"
  6235. : "+r"(src), // %0
  6236. "+r"(dst), // %1
  6237. "+r"(width) // %2
  6238. :
  6239. : "memory", "cc", "xmm2", "xmm3");
  6240. }
  6241. #endif // HAS_HALFFLOATROW_F16C
  6242. #ifdef HAS_ARGBCOLORTABLEROW_X86
  6243. // Tranform ARGB pixels with color table.
  6244. void ARGBColorTableRow_X86(uint8_t* dst_argb,
  6245. const uint8_t* table_argb,
  6246. int width) {
  6247. uintptr_t pixel_temp;
  6248. asm volatile(
  6249. // 1 pixel loop.
  6250. LABELALIGN
  6251. "1: \n"
  6252. "movzb (%0),%1 \n"
  6253. "lea 0x4(%0),%0 \n"
  6254. "movzb 0x00(%3,%1,4),%1 \n"
  6255. "mov %b1,-0x4(%0) \n"
  6256. "movzb -0x3(%0),%1 \n"
  6257. "movzb 0x01(%3,%1,4),%1 \n"
  6258. "mov %b1,-0x3(%0) \n"
  6259. "movzb -0x2(%0),%1 \n"
  6260. "movzb 0x02(%3,%1,4),%1 \n"
  6261. "mov %b1,-0x2(%0) \n"
  6262. "movzb -0x1(%0),%1 \n"
  6263. "movzb 0x03(%3,%1,4),%1 \n"
  6264. "mov %b1,-0x1(%0) \n"
  6265. "dec %2 \n"
  6266. "jg 1b \n"
  6267. : "+r"(dst_argb), // %0
  6268. "=&d"(pixel_temp), // %1
  6269. "+r"(width) // %2
  6270. : "r"(table_argb) // %3
  6271. : "memory", "cc");
  6272. }
  6273. #endif // HAS_ARGBCOLORTABLEROW_X86
  6274. #ifdef HAS_RGBCOLORTABLEROW_X86
  6275. // Tranform RGB pixels with color table.
  6276. void RGBColorTableRow_X86(uint8_t* dst_argb,
  6277. const uint8_t* table_argb,
  6278. int width) {
  6279. uintptr_t pixel_temp;
  6280. asm volatile(
  6281. // 1 pixel loop.
  6282. LABELALIGN
  6283. "1: \n"
  6284. "movzb (%0),%1 \n"
  6285. "lea 0x4(%0),%0 \n"
  6286. "movzb 0x00(%3,%1,4),%1 \n"
  6287. "mov %b1,-0x4(%0) \n"
  6288. "movzb -0x3(%0),%1 \n"
  6289. "movzb 0x01(%3,%1,4),%1 \n"
  6290. "mov %b1,-0x3(%0) \n"
  6291. "movzb -0x2(%0),%1 \n"
  6292. "movzb 0x02(%3,%1,4),%1 \n"
  6293. "mov %b1,-0x2(%0) \n"
  6294. "dec %2 \n"
  6295. "jg 1b \n"
  6296. : "+r"(dst_argb), // %0
  6297. "=&d"(pixel_temp), // %1
  6298. "+r"(width) // %2
  6299. : "r"(table_argb) // %3
  6300. : "memory", "cc");
  6301. }
  6302. #endif // HAS_RGBCOLORTABLEROW_X86
  6303. #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
  6304. // Tranform RGB pixels with luma table.
  6305. void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
  6306. uint8_t* dst_argb,
  6307. int width,
  6308. const uint8_t* luma,
  6309. uint32_t lumacoeff) {
  6310. uintptr_t pixel_temp;
  6311. uintptr_t table_temp;
  6312. asm volatile(
  6313. "movd %6,%%xmm3 \n"
  6314. "pshufd $0x0,%%xmm3,%%xmm3 \n"
  6315. "pcmpeqb %%xmm4,%%xmm4 \n"
  6316. "psllw $0x8,%%xmm4 \n"
  6317. "pxor %%xmm5,%%xmm5 \n"
  6318. // 4 pixel loop.
  6319. LABELALIGN
  6320. "1: \n"
  6321. "movdqu (%2),%%xmm0 \n"
  6322. "pmaddubsw %%xmm3,%%xmm0 \n"
  6323. "phaddw %%xmm0,%%xmm0 \n"
  6324. "pand %%xmm4,%%xmm0 \n"
  6325. "punpcklwd %%xmm5,%%xmm0 \n"
  6326. "movd %%xmm0,%k1 \n" // 32 bit offset
  6327. "add %5,%1 \n"
  6328. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  6329. "movzb (%2),%0 \n"
  6330. "movzb 0x00(%1,%0,1),%0 \n"
  6331. "mov %b0,(%3) \n"
  6332. "movzb 0x1(%2),%0 \n"
  6333. "movzb 0x00(%1,%0,1),%0 \n"
  6334. "mov %b0,0x1(%3) \n"
  6335. "movzb 0x2(%2),%0 \n"
  6336. "movzb 0x00(%1,%0,1),%0 \n"
  6337. "mov %b0,0x2(%3) \n"
  6338. "movzb 0x3(%2),%0 \n"
  6339. "mov %b0,0x3(%3) \n"
  6340. "movd %%xmm0,%k1 \n" // 32 bit offset
  6341. "add %5,%1 \n"
  6342. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  6343. "movzb 0x4(%2),%0 \n"
  6344. "movzb 0x00(%1,%0,1),%0 \n"
  6345. "mov %b0,0x4(%3) \n"
  6346. "movzb 0x5(%2),%0 \n"
  6347. "movzb 0x00(%1,%0,1),%0 \n"
  6348. "mov %b0,0x5(%3) \n"
  6349. "movzb 0x6(%2),%0 \n"
  6350. "movzb 0x00(%1,%0,1),%0 \n"
  6351. "mov %b0,0x6(%3) \n"
  6352. "movzb 0x7(%2),%0 \n"
  6353. "mov %b0,0x7(%3) \n"
  6354. "movd %%xmm0,%k1 \n" // 32 bit offset
  6355. "add %5,%1 \n"
  6356. "pshufd $0x39,%%xmm0,%%xmm0 \n"
  6357. "movzb 0x8(%2),%0 \n"
  6358. "movzb 0x00(%1,%0,1),%0 \n"
  6359. "mov %b0,0x8(%3) \n"
  6360. "movzb 0x9(%2),%0 \n"
  6361. "movzb 0x00(%1,%0,1),%0 \n"
  6362. "mov %b0,0x9(%3) \n"
  6363. "movzb 0xa(%2),%0 \n"
  6364. "movzb 0x00(%1,%0,1),%0 \n"
  6365. "mov %b0,0xa(%3) \n"
  6366. "movzb 0xb(%2),%0 \n"
  6367. "mov %b0,0xb(%3) \n"
  6368. "movd %%xmm0,%k1 \n" // 32 bit offset
  6369. "add %5,%1 \n"
  6370. "movzb 0xc(%2),%0 \n"
  6371. "movzb 0x00(%1,%0,1),%0 \n"
  6372. "mov %b0,0xc(%3) \n"
  6373. "movzb 0xd(%2),%0 \n"
  6374. "movzb 0x00(%1,%0,1),%0 \n"
  6375. "mov %b0,0xd(%3) \n"
  6376. "movzb 0xe(%2),%0 \n"
  6377. "movzb 0x00(%1,%0,1),%0 \n"
  6378. "mov %b0,0xe(%3) \n"
  6379. "movzb 0xf(%2),%0 \n"
  6380. "mov %b0,0xf(%3) \n"
  6381. "lea 0x10(%2),%2 \n"
  6382. "lea 0x10(%3),%3 \n"
  6383. "sub $0x4,%4 \n"
  6384. "jg 1b \n"
  6385. : "=&d"(pixel_temp), // %0
  6386. "=&a"(table_temp), // %1
  6387. "+r"(src_argb), // %2
  6388. "+r"(dst_argb), // %3
  6389. "+rm"(width) // %4
  6390. : "r"(luma), // %5
  6391. "rm"(lumacoeff) // %6
  6392. : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
  6393. }
  6394. #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
  6395. #ifdef HAS_NV21TOYUV24ROW_AVX2
  6396. // begin NV21ToYUV24Row_C avx2 constants
  6397. static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
  6398. 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
  6399. 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
  6400. 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
  6401. static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
  6402. 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
  6403. 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
  6404. 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};
  6405. static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
  6406. 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
  6407. 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
  6408. 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};
  6409. static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
  6410. 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
  6411. 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
  6412. 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};
  6413. static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
  6414. 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
  6415. 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
  6416. 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};
  6417. static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
  6418. 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
  6419. 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
  6420. 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};
  6421. static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
  6422. 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
  6423. 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
  6424. 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};
  6425. static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
  6426. 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
  6427. 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
  6428. 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};
  6429. static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
  6430. 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
  6431. 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
  6432. 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};
  6433. // NV21ToYUV24Row_AVX2
  6434. void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
  6435. const uint8_t* src_vu,
  6436. uint8_t* dst_yuv24,
  6437. int width) {
  6438. uint8_t* src_y_ptr;
  6439. uint64_t src_offset = 0;
  6440. uint64_t width64;
  6441. width64 = width;
  6442. src_y_ptr = (uint8_t*)src_y;
  6443. asm volatile(
  6444. "vmovdqu %5, %%ymm0 \n" // init blend value
  6445. "vmovdqu %6, %%ymm1 \n" // init blend value
  6446. "vmovdqu %7, %%ymm2 \n" // init blend value
  6447. // "sub $0x20, %3 \n" //sub 32 from width for final loop
  6448. LABELALIGN
  6449. "1: \n" // label 1
  6450. "vmovdqu (%0,%4), %%ymm3 \n" // src_y
  6451. "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
  6452. "vmovdqu (%1), %%ymm5 \n" // src_uv
  6453. "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
  6454. "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
  6455. // shuf
  6456. "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
  6457. // shuf
  6458. "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
  6459. "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
  6460. // shuf
  6461. "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
  6462. "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
  6463. "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
  6464. "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
  6465. "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
  6466. "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
  6467. "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
  6468. "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
  6469. "add $0x20, %4 \n" // add to src buffer
  6470. // ptr
  6471. "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
  6472. "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
  6473. "vmovdqu %%ymm4, (%2) \n" // store dst_yuv
  6474. "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
  6475. "add $0x60,%2 \n" // add to dst buffer
  6476. // ptr
  6477. // "cmp %3, %4 \n" //(width64 -
  6478. // 32 bytes) and src_offset
  6479. "sub $0x20,%3 \n" // 32 pixels per loop
  6480. "jg 1b \n"
  6481. "vzeroupper \n" // sse-avx2
  6482. // transistions
  6483. : "+r"(src_y), //%0
  6484. "+r"(src_vu), //%1
  6485. "+r"(dst_yuv24), //%2
  6486. "+r"(width64), //%3
  6487. "+r"(src_offset) //%4
  6488. : "m"(kBLEND0), //%5
  6489. "m"(kBLEND1), //%6
  6490. "m"(kBLEND2), //%7
  6491. "m"(kSHUF0), //%8
  6492. "m"(kSHUF1), //%9
  6493. "m"(kSHUF2), //%10
  6494. "m"(kSHUF3), //%11
  6495. "m"(kSHUF4), //%12
  6496. "m"(kSHUF5) //%13
  6497. : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
  6498. "xmm13", "xmm14", "xmm15");
  6499. }
  6500. #endif // HAS_NV21TOYUV24ROW_AVX2
  6501. #ifdef HAS_SWAPUVROW_SSSE3
  6502. // Shuffle table for reversing the bytes.
  6503. static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
  6504. 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
  6505. // Convert UV plane of NV12 to VU of NV21.
  6506. void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  6507. asm volatile(
  6508. "movdqu %3,%%xmm5 \n"
  6509. LABELALIGN
  6510. "1: \n"
  6511. "movdqu (%0),%%xmm0 \n"
  6512. "movdqu 0x10(%0),%%xmm1 \n"
  6513. "lea 0x20(%0),%0 \n"
  6514. "pshufb %%xmm5,%%xmm0 \n"
  6515. "pshufb %%xmm5,%%xmm1 \n"
  6516. "movdqu %%xmm0,(%1) \n"
  6517. "movdqu %%xmm1,0x10(%1) \n"
  6518. "lea 0x20(%1),%1 \n"
  6519. "sub $0x10,%2 \n"
  6520. "jg 1b \n"
  6521. : "+r"(src_uv), // %0
  6522. "+r"(dst_vu), // %1
  6523. "+r"(width) // %2
  6524. : "m"(kShuffleUVToVU) // %3
  6525. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  6526. }
  6527. #endif // HAS_SWAPUVROW_SSSE3
  6528. #ifdef HAS_SWAPUVROW_AVX2
  6529. void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  6530. asm volatile(
  6531. "vbroadcastf128 %3,%%ymm5 \n"
  6532. LABELALIGN
  6533. "1: \n"
  6534. "vmovdqu (%0),%%ymm0 \n"
  6535. "vmovdqu 0x20(%0),%%ymm1 \n"
  6536. "lea 0x40(%0),%0 \n"
  6537. "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
  6538. "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
  6539. "vmovdqu %%ymm0,(%1) \n"
  6540. "vmovdqu %%ymm1,0x20(%1) \n"
  6541. "lea 0x40(%1),%1 \n"
  6542. "sub $0x20,%2 \n"
  6543. "jg 1b \n"
  6544. "vzeroupper \n"
  6545. : "+r"(src_uv), // %0
  6546. "+r"(dst_vu), // %1
  6547. "+r"(width) // %2
  6548. : "m"(kShuffleUVToVU) // %3
  6549. : "memory", "cc", "xmm0", "xmm1", "xmm5");
  6550. }
  6551. #endif // HAS_SWAPUVROW_AVX2
  6552. void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
  6553. int src_stride_u,
  6554. const uint8_t* src_v,
  6555. int src_stride_v,
  6556. uint8_t* dst_uv,
  6557. int width) {
  6558. asm volatile(
  6559. "pcmpeqb %%xmm4,%%xmm4 \n"
  6560. "psrlw $0xf,%%xmm4 \n"
  6561. "packuswb %%xmm4,%%xmm4 \n"
  6562. "pxor %%xmm5,%%xmm5 \n"
  6563. "1: \n"
  6564. LABELALIGN
  6565. "1: \n"
  6566. "movdqu (%0),%%xmm0 \n" // load 16 U values
  6567. "movdqu (%1),%%xmm1 \n" // load 16 V values
  6568. "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row
  6569. "movdqu 0(%1,%5,1),%%xmm3 \n"
  6570. "lea 0x10(%0),%0 \n"
  6571. "pmaddubsw %%xmm4,%%xmm0 \n" // half size
  6572. "pmaddubsw %%xmm4,%%xmm1 \n"
  6573. "pmaddubsw %%xmm4,%%xmm2 \n"
  6574. "pmaddubsw %%xmm4,%%xmm3 \n"
  6575. "lea 0x10(%1),%1 \n"
  6576. "paddw %%xmm2,%%xmm0 \n"
  6577. "paddw %%xmm3,%%xmm1 \n"
  6578. "psrlw $0x1,%%xmm0 \n"
  6579. "psrlw $0x1,%%xmm1 \n"
  6580. "pavgw %%xmm5,%%xmm0 \n"
  6581. "pavgw %%xmm5,%%xmm1 \n"
  6582. "packuswb %%xmm0,%%xmm0 \n"
  6583. "packuswb %%xmm1,%%xmm1 \n"
  6584. "punpcklbw %%xmm1,%%xmm0 \n"
  6585. "movdqu %%xmm0,(%2) \n" // store 8 UV pixels
  6586. "lea 0x10(%2),%2 \n"
  6587. "sub $0x10,%3 \n" // 16 src pixels per loop
  6588. "jg 1b \n"
  6589. : "+r"(src_u), // %0
  6590. "+r"(src_v), // %1
  6591. "+r"(dst_uv), // %2
  6592. "+r"(width) // %3
  6593. : "r"((intptr_t)(src_stride_u)), // %4
  6594. "r"((intptr_t)(src_stride_v)) // %5
  6595. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  6596. }
  6597. void HalfMergeUVRow_AVX2(const uint8_t* src_u,
  6598. int src_stride_u,
  6599. const uint8_t* src_v,
  6600. int src_stride_v,
  6601. uint8_t* dst_uv,
  6602. int width) {
  6603. asm volatile(
  6604. "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
  6605. "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
  6606. "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
  6607. "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
  6608. "1: \n"
  6609. LABELALIGN
  6610. "1: \n"
  6611. "vmovdqu (%0),%%ymm0 \n" // load 32 U values
  6612. "vmovdqu (%1),%%ymm1 \n" // load 32 V values
  6613. "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
  6614. "vmovdqu 0(%1,%5,1),%%ymm3 \n"
  6615. "lea 0x20(%0),%0 \n"
  6616. "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
  6617. "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
  6618. "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
  6619. "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
  6620. "lea 0x20(%1),%1 \n"
  6621. "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
  6622. "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
  6623. "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
  6624. "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
  6625. "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
  6626. "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
  6627. "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
  6628. "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
  6629. "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
  6630. "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
  6631. "lea 0x20(%2),%2 \n"
  6632. "sub $0x20,%3 \n" // 32 src pixels per loop
  6633. "jg 1b \n"
  6634. "vzeroupper \n"
  6635. : "+r"(src_u), // %0
  6636. "+r"(src_v), // %1
  6637. "+r"(dst_uv), // %2
  6638. "+r"(width) // %3
  6639. : "r"((intptr_t)(src_stride_u)), // %4
  6640. "r"((intptr_t)(src_stride_v)) // %5
  6641. : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  6642. }
  6643. #endif // defined(__x86_64__) || defined(__i386__)
  6644. #ifdef __cplusplus
  6645. } // extern "C"
  6646. } // namespace libyuv
  6647. #endif