  1. \documentclass[10pt,journal,cspaper,compsoc]{IEEEtran}
  2. \IEEEoverridecommandlockouts
  3. % Math related
  4. \usepackage{amsmath,amssymb}
  5. \usepackage{bbm}
  6. % \usepackage{ruler}
  7. \usepackage{mathtools}
  8. \DeclarePairedDelimiter\ceil{\lceil}{\rceil}
  9. \DeclarePairedDelimiter\floor{\lfloor}{\rfloor}
  10. \DeclareMathOperator*{\argmax}{arg\,max} % Jan Hlavacek
  11. \newcommand{\rectangle}{{%
  12. \ooalign{$\sqsubset\mkern3mu$\cr$\mkern3mu\sqsupset$\cr}%
  13. }}
  14. % table related
  15. \usepackage{colortbl}
  16. \usepackage{makecell}
  17. % Figure related
  18. \usepackage{graphicx, overpic, wrapfig, subfigure}
  19. \usepackage[dvipsnames]{xcolor}
  20. \definecolor{americanrose}{rgb}{1.0, 0.01, 0.24}
  21. \definecolor{myred}{rgb}{0.753, 0.314, 0.275}
  22. \definecolor{myblue}{rgb}{0.0, 0.24, 0.95}
  23. \definecolor{tbl_gray}{gray}{0.85}
  24. \newcommand\MYhyperrefoptions{bookmarks=true,bookmarksnumbered=true,
  25. pdfpagemode={UseOutlines},plainpages=false,pdfpagelabels=true,
  26. colorlinks=true,linkcolor={americanrose},citecolor={myblue},urlcolor={red},
  27. pdftitle={Deep Hough Transform for Semantic Line Detection},%<!CHANGE!
  28. pdfsubject={Typesetting},%<!CHANGE!
  29. pdfauthor={Kai Zhao et al.},%<!CHANGE!
  30. pdfkeywords={Straight line detection, Hough transform, CNN}}%<^!CHANGE!
  31. \usepackage[\MYhyperrefoptions,pdftex]{hyperref}
  32. % cref must be loaded after hyperref
  33. \usepackage{cleveref}
  34. \crefname{equation}{Eq.}{Eq.}
  35. \crefname{figure}{Fig.}{Fig.}
  36. \crefname{table}{Tab.}{Tab.}
  37. \crefname{section}{Sec.}{Sec.}
  38. \newcommand{\revise}[1]{{\textcolor{black}{#1}}}
  39. \newcommand{\rerevise}[1]{{\textcolor{black}{#1}}}
  40. %\newcommand{\GramaCheck}{}
  41. \ifdefined \GramaCheck
  42. \newcommand{\CheckRmv}[1]{}
  43. \renewcommand{\eqref}[1]{Equation 1}
  44. \else
  45. \newcommand{\CheckRmv}[1]{#1}
  46. \renewcommand{\eqref}[1]{Equation~(\ref{#1})}
  47. \fi
  48. \newcommand{\Arixv}{}
  49. \ifdefined \Arixv
  50. \newcommand{\ArxivRmv}[1]{}
  51. \else
  52. \newcommand{\ArxivRmv}[1]{#1}
  53. \fi
  54. \def\etal{\emph{et al.~}}
  55. \def\ie{\emph{i.e.,~}}
  56. \def\eg{\emph{e.g.,~}}
  57. \newcommand{\todo}[1]{{\textcolor{red}{#1}}}%
  58. \newcommand{\hq}[1]{{\textcolor{blue}{#1}}}
  59. % table related
  60. \usepackage{booktabs} % provides toprule
  61. \usepackage{array}
  62. \usepackage{diagbox}
  63. \usepackage{multirow}
  64. \usepackage{colortbl}
  65. \usepackage{silence}
  66. \hbadness=10000 \vbadness=10000
  67. \WarningFilter{latex}{Font shape declaration has incorrect series value}
  68. \graphicspath{{./figures/}{./figures/photos/}}
  69. \usepackage[nocompress]{cite}
  70. \hyphenation{op-tical net-works semi-conduc-tor}
  71. \begin{document}
  72. \title{Deep Hough Transform for Semantic Line Detection}
  73. \author{
\IEEEauthorblockN{Kai Zhao\thanks{\IEEEauthorrefmark{1} The first two authors contributed equally to this paper.}\IEEEauthorrefmark{1}},
  75. Qi Han\IEEEauthorrefmark{1},
  76. Chang-Bin Zhang,
  77. Jun Xu,
  78. \IEEEauthorblockN{Ming-Ming Cheng\thanks{\IEEEauthorrefmark{2} M.M. Cheng is the corresponding author (cmm@nankai.edu.cn).}\IEEEauthorrefmark{2}},~\IEEEmembership{Senior Member,~IEEE}
  79. \IEEEcompsocitemizethanks{
  80. \IEEEcompsocthanksitem Kai Zhao, Qi Han, Chang-Bin Zhang, and Ming-Ming Cheng are with the TKLNDST,
  81. College of Computer Science, Nankai University, Tianjin, China, 300350.
  82. %
  83. \IEEEcompsocthanksitem Jun Xu is with the School of Statistics and Data Science, Nankai University, Tianjin, China, 300071.
  84. \IEEEcompsocthanksitem A preliminary version of this work has been presented in \cite{eccv2020line}.
  85. }\\
  86. }
  87. % paper headers
\markboth{IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. XX, No. XX, April~2021}%
  89. {Zhao \MakeLowercase{\textit{et al.}}:
  90. Deep Hough Transform for Semantic Line Detection}
  91. %{Shell \MakeLowercase{\textit{et al.}}: Bare Advanced Demo of IEEEtran.cls for IEEE Computer Society Journals}
  92. \IEEEtitleabstractindextext{%
  93. \begin{abstract}
  94. We focus on a fundamental task of detecting meaningful line structures,
\textsl{a.k.a.} semantic lines, in natural scenes.
  96. %
  97. Many previous methods regard this problem as a special
  98. case of object detection
  99. and adjust existing object detectors for semantic line detection.
  100. %
  101. However, these methods neglect the inherent characteristics of lines,
  102. leading to sub-optimal performance.
  103. %
Lines have much simpler geometric properties than complex objects
and thus can be compactly parameterized by a few parameters.
  106. %
  107. To better exploit the property of lines, in this paper,
  108. we incorporate the classical Hough transform technique into
  109. deeply learned representations and
  110. propose a one-shot end-to-end learning framework for line detection.
  111. % Deep Hough Transform (DHT) network
  112. %
  113. By parameterizing lines with slopes and biases,
  114. we perform Hough transform to translate
  115. deep representations into the parametric domain,
  116. in which we perform line detection.
  117. %
  118. Specifically, we aggregate features along candidate lines on the
  119. feature map plane
  120. and then assign the aggregated features to corresponding locations
  121. in the parametric domain.
  122. %
  123. Consequently, the problem of detecting semantic lines in the spatial domain
  124. is transformed into spotting individual points in the parametric domain,
  125. making the post-processing steps,
  126. \ie non-maximal suppression, more efficient.
  127. %
  128. Furthermore, our method makes it easy to extract contextual line features
  129. % \eg features along lines close to a specific line,
  130. that are critical for accurate line detection.
  131. %
  132. In addition to the proposed method, we design an evaluation metric to assess the
quality of line detection and construct a large-scale dataset for the line detection task.
  134. %
  135. Experimental results on our proposed dataset and another public dataset
  136. demonstrate the advantages of our method over previous state-of-the-art
  137. alternatives.
  138. %
  139. % The \revise{dataset and} source code is available at~\url{https://mmcheng.net/dhtline/}.
  140. \end{abstract}
  141. % Note that keywords are not normally used for peerreview papers.
  142. \begin{IEEEkeywords}
  143. Semantic line detection, Hough transform, CNN, Deep Learning.
  144. \end{IEEEkeywords}
  145. }
  146. % make the title area
  147. \maketitle
  148. \IEEEdisplaynontitleabstractindextext
  149. \IEEEpeerreviewmaketitle
  150. \section{Introduction}\label{sec:introduction}
  151. \IEEEPARstart{D}etecting line structures from digital images has
  152. a long history in computer vision.
  153. %
  154. The organization of line structures is an early yet essential step to transform the visual signal into useful intermediate concepts
  155. for visual interpretation~\cite{burns1986extracting}.
  156. % %
  157. % Previous methods mainly focus on detecting arbitrary line segments using low-level features,
  158. % e.g., texture and gradient.
  159. %
  160. Though many techniques have been proposed to detect salient objects
  161. \cite{zhao2019optimizing,HouPami19Dss,gao2020sod100k,BorjiCVM2019,wang2021revisiting}
  162. and areas~\cite{cheng2015global,zhu2014saliency,Fan2020S4Net},
little work has been done on detecting outstanding, structure-revealing line structures.
  164. %
A recent study~\cite{lee2017semantic} proposed to detect the outstanding straight line(s),
referred to as ``semantic lines'', that outline the conceptual structure of natural images.
  167. % ~\eg the straight lines that divides different semantic regions or highlight the structure
  168. % of scenes.
  169. %
  170. Identifying these semantic lines is of crucial importance for
  171. computer graphics and vision applications, such as
  172. photographic composition~\cite{liu2010optimizing,freeman2007photographer},
  173. structure-preserving image processing~\cite{TIP20_SP_NPR,hu2013patchnet},
  174. image aesthetic~\cite{ko2018pac,lee2019property,kong2016photo,mai2016composition},
  175. lane detection~\cite{fan2019spinnet},
  176. and artistic creation~\cite{krages2012photography,hu2013inverse,chen2009sketch2photo,zhang2020and}.
  177. %
  178. \revise{
  179. As demonstrated in~\cref{fig:photographic-composition},
Liu \etal~\cite{liu2010optimizing} proposed to crop images according
to the golden ratio by using `prominent lines'.
Detecting these `semantic lines' can help produce images that
are visually pleasing in terms of photographic composition.
  184. }
  185. %
  186. The Hough transform~\cite{duda1971use,ballard1981generating} is one
  187. representative method for line detection,
  188. which was first proposed to detect straight lines in bubble chamber
  189. photographs~\cite{hough1962method}.
  190. % generalized
Owing to its simplicity and efficiency,
HT has been employed to detect lines in digital images~\cite{duda1971use},
and was further extended by~\cite{ballard1981generating} to detect other regular shapes such as circles and rectangles.
  194. %
  195. The key idea of the Hough transform is to vote evidence from the image domain to the parametric domain, and
  196. then detect shapes in the parametric domain by identifying local-maximal responses.
  197. %
  198. In the case of line detection, a line in the image domain can be represented
  199. by its parameters, \eg slope, and \revise{offset} in the parametric space.
  200. %
The Hough transform collects evidence along a line in an image
and accumulates it into a single point in the parameter space.
  203. %
  204. Consequently, line detection in the image domain is converted to the problem of detecting peak responses in the
  205. parametric domain.
  206. %
  207. Classical Hough transform based line detectors
  208. ~\cite{fernandes2008real,yacoub1995hierarchical,princen1990hierarchical,kiryati1991probabilistic}
  209. usually detect continuous straight edges while neglecting the semantics in line structures.
  210. %
  211. Moreover, these methods are sensitive to light changes and occlusion.
  212. %
  213. Therefore, the results are often noisy and contain irrelevant lines~\cite{akinlar2011edlines},
  214. as shown in~\cref{fig:photographic-composition}(d).
  215. \newcommand{\addImg}[1]{\subfigure[]{\includegraphics[width=.245\linewidth]{figures/#1}}}
  216. \CheckRmv{
  217. \begin{figure*}[t]
  218. \centering
  219. \hfill
  220. \addImg{composition_1.pdf}
  221. \addImg{composition_2.pdf}
  222. \addImg{composition_3.pdf}
  223. \addImg{composition_4.pdf}
  224. \hfill
  225. \caption{
  226. Example pictures from~\cite{liu2010optimizing} reveal that semantic lines
  227. may help in the photographic composition.
  228. (a): a photo was taken with an arbitrary pose.
  229. %
(b): a photo that fits the golden ratio principle~\cite{caplin2008art,krages2012photography}, which \revise{is} obtained by the method described
in~\cite{liu2010optimizing} using so-called `prominent lines'
  232. % and salient objects~\cite{gao2020sod100k,fan2020bbs,fan2019D3Net}
  233. in the image.
  234. %
  235. (c): Our detection results \revise{are} clean and comprise only a few meaningful
  236. lines that are potentially helpful in the photographic composition.
  237. %
(d): Results of classical line detection algorithms
often focus on fine, detailed straight edges.
  240. %Hough transform is in chaos and full of noisy lines.
  241. }
  242. \label{fig:photographic-composition}
  243. \end{figure*}
  244. }
  245. Convolutional Neural Networks (CNNs) have achieved remarkable success in a wide range of computer vision tasks.
  246. %
  247. Several recent studies~\cite{lee2017semantic,zhang2019ppgnet} have proposed CNN-based methods for line detection.
  248. %
  249. Concretely, they regard line detection as a special case of object detection and employ existing object detectors
\eg Faster R-CNN~\cite{ren2015faster} or CornerNet~\cite{law2018cornernet},
  251. for line detection.
  252. %
  253. Limited by the ROI pooling and non-maximal suppression of lines,
  254. both~\cite{lee2017semantic} and~\cite{zhang2019ppgnet} are less efficient in terms of
  255. running time.
  256. %
  257. % For example,~\cite{lee2017semantic} takes 0.3 seconds to process an image.
  258. %
Moreover, ROI pooling~\cite{girshick2015fast} aggregates features along a
single line, while many recent studies reveal that richer contextual information is critical to many
  261. tasks, \eg video classification~\cite{wang2018non} and semantic segmentation~\cite{huang2019ccnet}.
  262. %
This point will be validated in~\cref{sec:ablation}, in which we experimentally verify that aggregating features along only a single line produces sub-optimal results.
Incorporating powerful CNNs into the Hough transform is a promising direction for
semantic line detection.
  266. %
  267. A simple way of combining CNN with Hough transform is performing edge detection
  268. with a CNN-based edge detector~\cite{RcfEdgePami2019,xie2015holistically}
and then applying the standard Hough transform to the edge maps.
  270. %
  271. However, the two components have diverse optimization targets, leading to sub-optimal results, as
  272. evidenced by our experiments.
  273. %
In this paper, we propose to incorporate the CNN and the Hough transform in an end-to-end manner
so that each component of our proposed method shares the same optimization target.
  276. %
  277. Our method first extracts pixel-wise representations with a CNN-based encoder
and then performs the Hough transform on the deep representations to convert them
from the feature space into the parametric space.
  280. %
The global line detection problem is then converted to detecting peak responses in
the transformed features, which is much simpler.
  283. %
  284. For example, the time-consuming non-maximal suppression (NMS) can be simply
  285. replaced by calculating the centroids of connected areas in the parametric space,
making our method efficient enough to detect lines in real time.
  287. %
  288. Moreover, in the detection stage, we use several convolutional layers on
  289. top of the transformed features to aggregate context-aware features of nearby lines.
  290. %
Consequently, the final decision is made based not only on the features of a single line,
but also on information about nearby lines.
  293. %
  294. As shown in \cref{fig:photographic-composition}(c),
our method detects clean, meaningful, and outstanding lines
that are helpful for photographic composition.
  297. To better evaluate line detection methods, we introduce a principled metric to assess
  298. the agreement of a detected line \textsl{w.r.t.} its corresponding ground-truth line.
  299. %
  300. Although~\cite{lee2017semantic} has proposed an evaluation metric that uses intersection areas to measure the similarity between a pair of lines,
  301. this measurement may lead to ambiguous and misleading results.
  302. %
Finally, we collect a large-scale dataset with \revise{6,500} carefully annotated images for semantic
  304. line detection.
  305. %
  306. The new dataset, namely \revise{NKL (short for \textbf{N}an\textbf{K}ai \textbf{L}ines),}
  307. contains images of diverse scenes,
and its scale is much larger than that of the existing SEL~\cite{lee2017semantic} dataset
in terms of both images and annotated lines.
  310. The contributions of this paper are summarized below:
  311. \begin{itemize}\setlength\itemsep{0.3em}
\item We propose an end-to-end framework that incorporates the feature learning capacity of CNNs with the Hough transform, resulting in an efficient real-time solution for semantic line detection.
  313. \item To facilitate the research of semantic line detection, we construct
a new dataset with \revise{6,500} images, which is larger and more diverse than the previous SEL dataset~\cite{lee2017semantic}.
  315. \item We introduce a principled metric that measures the similarity between two lines.
  316. Compared with the previous IOU based metric~\cite{lee2017semantic},
our metric has a straightforward interpretation and \revise{is simple to implement},
  318. as detailed in~\cref{sec:metric}.
  319. \item Evaluation results on an open benchmark demonstrate that our method
outperforms prior arts by a significant margin.
  321. \end{itemize}
  322. A preliminary version of this work was presented in~\cite{eccv2020line}.
  323. %
  324. In this extended work, we introduce three major improvements:
  325. \begin{itemize}\setlength\itemsep{0.3em}
  326. \item We propose a novel ``edge-guided refinement'' module to adjust line positions and obtain better detection performance with the help of accurate edge information.
  327. %
  328. This part is detailed in~\cref{sec:refine}.
  329. %
  330. \item We introduce a new large-scale dataset for semantic line detection, as presented in~\cref{sec:nkl-dataset}.
  331. %
  332. The new dataset, \revise{namely NKL
  333. (short for \textbf{N}an\textbf{K}ai \textbf{L}ines),
  334. contains 6,500 images in total}, and each image is annotated by multiple skilled annotators.
  335. %
  336. \item We employ the maximal bipartite graph matching~\cite{kuhn1955hungarian} to match ground-truth and detected lines during evaluation (\cref{sec:protocol}).
  337. %
  338. The matching procedure removes redundant true positives so that each ground-truth line is associated with
  339. at most one detected line and vice versa.
  340. \end{itemize}
  341. The rest of this paper is organized as follows:
  342. %
  343. \cref{sec:related-work} summarizes the related works.
  344. %
  345. \cref{sec:dht} elaborates the proposed Deep Hough transform method.
  346. %
  347. \cref{sec:metric} describes the proposed evaluating metric, which is used to assess the similarity between a pair of lines.
  348. %
  349. \cref{sec:nkl-dataset} introduces our newly constructed dataset.
  350. %
\cref{sec:experiments} presents experimental details and reports comparison results.
  352. %
\cref{sec:conclusion} concludes this paper.
  354. %-----------------------------------------------------------------------------------%
  355. \section{Related Work}\label{sec:related-work}
  356. %-----------------------------------------------------------------------------------%
  357. The research of line detection in digital images dates back to the very early stage of
  358. computer vision research.
  359. %
Here, we first briefly review the evolution of the Hough transform (HT)~\cite{duda1971use},
one of the most fundamental tools for line detection.
%
Then we introduce several recent CNN-based methods for line detection.
%
Finally, we summarize the methods and datasets for semantic line detection.
  366. \subsection{Hough transform}
The Hough transform (HT) was first proposed in~\cite{hough1962method} for machine analysis of bubble chamber photographs.
  368. %
It parametrizes straight lines with \revise{slope-offset} parameters, leading to an unbounded transform space (since the slope can be infinite).
  370. %
\cite{duda1971use} extended HT by using angle-radius rather than \revise{slope-offset}
parameters, which is conceptually similar to the two-dimensional Radon transform~\cite{radon20051}.
  373. %
Then Ballard~\cite{ballard1981generating} generalized the idea of HT to localize arbitrary shapes, \eg ellipses and circles, in digital images.
  375. %
  376. For example, by parameterizing with angle and radius, line detection can be performed by voting edge evidence and finding peak response in the finite parametric space.
  377. %
Typically, with edge detectors such as Canny~\cite{canny1986computational} and Sobel~\cite{sobel},
the detected lines correspond to the local maximal responses in the transformed parametric space.
  380. %
  381. \revise{The core idea of HT is used in two recent
  382. works which
  383. parameterize the outputs of CNNs with offsets and orientations
  384. to predict surface meshes~\cite{chen2020bsp} or
  385. convex decomposition~\cite{deng2020cvxnet} of 3D shapes.}
  386. Despite the success of HT on line detection, it suffers from high computational costs and unstable performance.
  387. % There are many subsequent studies trying to remedy these shortcomings of the original algorithm.
  388. %
To accelerate the voting of HT, Kiryati \etal \cite{kiryati1991probabilistic}
proposed the ``probabilistic Hough transform'', which randomly picks
sample points from a line,
while~\cite{finding1976} uses the gradient direction of images to decide
the voting points.
  394. %
Meanwhile, the works of \cite{fernandes2008real,limberger2015real}
employed a kernel-based Hough transform that performs Hough voting with
an elliptical-Gaussian kernel on collinear pixels to boost the original HT.
  398. %
Besides, Princen \etal \cite{princen1990hierarchical,yacoub1995hierarchical} partitioned the input image into
hierarchical patches and then applied HT independently to each patch.
  401. %
Illingworth \etal \cite{illingworth1987adaptive} used a coarse-to-fine accumulation and search
strategy to identify significant peaks in the Hough parametric space.
  404. %
  405. \cite{aggarwal2006line} tackled line detection within a
  406. regularized framework, to suppress the effect of noise and clutter corresponding to nonlinear image features.
  407. %
  408. The Hough voting scheme is also used in many other tasks such as
detecting centroids of 3D shapes in point clouds~\cite{qi2019deep} and finding image correspondences~\cite{min2019hyperpixel}.
  410. % The research of line detection in digital images dates back to the very early
  411. % stage of computer vision.
  412. % %
  413. % Since the majority of line detection methods are based on the Hough transform~\cite{duda1971use},
  414. % we first brief the Hough transform, and then
  415. % summarize several early methods for line detection using Hough transform.
  416. % %
  417. % Finally, we describe two recently proposed CNN-based methods for line/segments
  418. % detection from natural images.
  419. % \subsection{Hough based line detectors.}
  420. % Hough transform (HT) is originally devised by Hough~\cite{hough1962method}
  421. % to detect straight lines from bubble chamber photographs.
  422. % %
  423. % The algorithm is then extended~\cite{duda1971use} and generalized
  424. % ~\cite{ballard1981generating} to localize arbitrary shapes,
  425. % \eg ellipses and circles, from digital images.
  426. % %
  427. % Traditional line detectors start by edge detection in an image, typically
  428. % with the Canny~\cite{canny1986computational} and Sobel~\cite{sobel} operators.
  429. % %
  430. % Then the next step is to apply the Hough transform and finally detect
  431. % lines by picking peak response in the transformed space.
  432. % %
  433. % HT collects edge response alone a line and accumulates them to
  434. % a single point in the parametric space.
  435. % There are many variants of Hough transform (HT) trying to remedy different
  436. % shortcomings of the original algorithm.
  437. % %
  438. % The original HT maps each image point to all points in the parameter space,
  439. % resulting in a many-to-many voting scheme.
  440. % %
  441. % Consequently, the original HT presents high computational cost,
  442. % especially when dealing with large-size images.
  443. % %
  444. % Nahum \etal \cite{kiryati1991probabilistic} try to accelerate HT
  445. % by proposing the `probabilistic Hough transform'
  446. % that randomly picks sample points from a line.
  447. % %
  448. % John \etal \cite{princen1990hierarchical,yacoub1995hierarchical} partition the input image into
  449. % hierarchical image patches, and then apply HT independently to these patches.
  450. % %
  451. % Fernandes \etal \cite{fernandes2008real} use an oriented elliptical-Gaussian kernel
  452. % to cast votes for only a few lines in the parameter space.
  453. % %
  454. % Illingworth \etal \cite{illingworth1987adaptive} use a `coarse to fine' accumulation and search
  455. % strategy to identify significant peaks in the Hough parametric spaces.
  456. % %
  457. % ~\cite{aggarwal2006line} approaches line detection within a
  458. % regularized framework, to suppress the effect of noise and clutter corresponding
  459. % to image features which are not linear.
  460. % %
  461. % It's worth noting that a clean input edge map is critical to these HT-based detectors.
  462. % %-----------------------------------------------------------------------------------%
\subsection{Line Segment Detection}
Despite its robustness and parallelism, the Hough transform cannot be directly used for line segment detection,
since it cannot determine the endpoints of line segments.
  466. %
  467. Probabilistic Hough transform~\cite{kiryati1991probabilistic} uses random sampling in the voting scheme,
  468. and reconstructs line segments by localizing the sample locations.
  469. %
  470. But this method still prefers long straight lines.
  471. %
  472. In addition to Hough transform, many other studies have been developed to detect line segments.
  473. %
Burns \etal \cite{burns1986extracting} used the edge orientation as the guide for line segment extraction.
  475. %
  476. The main advantage is that the orientation of the gradients can help to discover low-contrast lines
  477. and endpoints.
  478. %
Etemadi \etal \cite{etemadi1992robust} built chains from the given edge map and
extracted line segments and their orientations by walking over these chains.
  481. % quantized
Chan \etal \cite{chan1996line} used quantized edge orientations to search for and merge short line segments.
  483. %
Gioi \etal \cite{von2008lsd} proposed a linear-time line segment detector (LSD) that requires no parameter tuning,
which has been used by many subsequent studies~\cite{akinlar2011edlines,akinlar2013edcircles,feng2013automatic}.
% references to be added here
\subsection{CNN-based Line Detection}
  488. % There are two CNN-based line (segment) detectors that are closely related to
  489. % our method.
  490. %
Recently, CNNs have brought remarkable improvements to computer vision tasks and have also been applied to line detection.
  492. %
These methods either focus on straight line detection, \eg semantic line detection~\cite{lee2017semantic,ahmad2017comparison}, or on line segment detection,
  494. \eg wireframe parsing~\cite{zhang2019ppgnet,huang2018learning,zhou2019end,xue2020holistically}.
  495. %
Lee \etal \cite{lee2017semantic} followed the two-branch pipeline of Faster R-CNN~\cite{ren2015faster}
and proposed a straight line detection framework to find meaningful semantic straight lines in an image.
  498. %
  499. One branch verifies the existence of a line and the other branch further refines the position of the line
  500. by regression.
  501. %
Zhang \etal \cite{zhang2019ppgnet} adopted the concept of CornerNet~\cite{law2018cornernet}
to extract line segments as pairs of key points in indoor scenes.
  504. %
  505. Huang \etal \cite{huang2018learning} proposed a two-head network to predict lines and junction points for wireframe parsing.
  506. %
This was extended in~\cite{zhou2019end} by adding a line proposal sub-network.
  508. %
Zhou \etal \cite{zhou2019end} proposed an end-to-end architecture to perform accurate line segment detection for wireframe parsing.
All these methods extract line-wise feature vectors by LoI pooling, which aggregates deep features solely along each line,
  511. leading to inadequate context information.
  512. \subsection{Semantic Line Detection}
The meaningful straight line that helps photographic composition was first discussed in~\cite{lee2017semantic}
and named the ``semantic line''.
  515. %
  516. \cite{lee2017semantic} regarded semantic line detection as a special case of object detection.
  517. %
  518. It first extracts CNN representations of line proposals using LoI pooling, which bilinearly interpolates the
  519. features along the entire straight line.
  520. %
  521. Then the line representations are verified by a classifier and a regressor, similar to Faster-RCNN~\cite{girshick2015fast}.
  522. %
  523. The line proposals are all unique lines in an image.
  524. %
An intersection over union (IoU) metric for straight lines was proposed in~\cite{lee2017semantic} to evaluate the similarity between two straight lines in an image.
  526. %
This metric may produce ambiguous results in some scenarios, as will be discussed in~\cref{sec:metric}.
  528. %
  529. Besides, Lee \etal \cite{lee2017semantic} collected a semantic line detection dataset which
contains about 1,700 outdoor images, most of which are natural landscapes.
  531. % \cite{qi2019deep} proposes to vote 3D objects to their centroid for object detection in cloud points.
  532. %-----------------------------------------------------------------------------------%
  533. \section{Approach}\label{sec:dht}
  534. %-----------------------------------------------------------------------------------%
  535. In this section, we give the details of the proposed deep Hough transform for semantic line
  536. detection.
  537. %
  538. Our proposed method mainly contains four components:
  539. 1) a CNN encoder that extracts pixel-wise deep representations;
  540. 2) the deep Hough transform (DHT) that converts the deep representations from the spatial domain to the parametric domain;
  541. 3) a line detector that is responsible for detecting lines in the parametric space, and
  542. 4) a reverse Hough transform (RHT) component that converts the detected lines back to image space.
  543. %
  544. All these components are integrated in an end-to-end framework that performs forward inference and backward training within a single step.
  545. \revise{The pipeline is illustrated in~\cref{fig:pipeline}, and the detailed architecture is shown in the supplementary materials.}
  546. \CheckRmv{
  547. \begin{figure*}[tb!]
  548. \centering
  549. \begin{overpic}[scale=0.52]{figures/pipeline.pdf}
  550. \put(31, 28){$X$}
  551. \put(54, 28){$Y$}
  552. \put(40.2, 13.9){$\mathcal{H}$}
  553. \put(40.2, 17.65){$\mathcal{H}$}
  554. \put(40.2, 21.45){$\mathcal{H}$}
  555. \put(40.2, 25.35){$\mathcal{H}$}
  556. \put(83.65, 11.5){$\mathcal{R}$}
  557. \put(41.7, 1){$\mathcal{H}$}
  558. \put(55.3, 1){$\mathcal{R}$}
  559. \put(67.15, 1.2){\scriptsize CTX}
  560. \put(64.1, 14.2){\scriptsize CTX}
  561. \put(64.1, 18.4){\scriptsize CTX}
  562. \put(64.1, 21.5){\scriptsize CTX}
  563. \put(64.1, 25){\scriptsize CTX}
  564. \put(88.6, 19){\scriptsize Loss}
  565. \put(85, 27){\small Training only}
  566. \put(93.5, 7.5){\small Testing}
  567. \put(94.5, 5){\small only}
  568. \put(5, 0.8){Upsample + Concat}
  569. \put(30, 0.8){Add}
  570. \put(47, 0.8){DHT}
  571. \put(60, 0.8){RHT}
  572. \put(72, 0.8){CTX}
  573. \end{overpic}
  574. % \vspace{-8pt}
  575. \caption{
  576. \revise{The pipeline of our proposed method. DHT is short for the proposed Deep Hough Transform,
  577. and RHT represents the Reverse Hough Transform. CTX means the context-aware line detector which contains
  578. multiple convolutional layers.}
  579. }\label{fig:pipeline}
  580. \end{figure*}
  581. }
  582. %-----------------------------------------------------------------------------------%
\subsection{Line Parameterization and Its Reverse} \label{sec:preliminary}
  584. \CheckRmv{
  585. \begin{figure}[!htb]
  586. \centering
  587. \begin{overpic}[height=.6\linewidth]{figures/line_param.pdf}
  588. \put(29, 29){$r_l$}
  589. \put(57, 55){$\theta_l$}
  590. \put(100, 50){$x$}
  591. \put(52, 100){$y$}
  592. \end{overpic}
\caption{A line can be parameterized by its distance $r_l$ to the origin and its orientation $\theta_l$.
  594. }\label{fig:param}
  595. \end{figure}
  596. }
  597. %
  598. As shown in~\cref{fig:param},
  599. given a 2D image \revise{$I_{H\times W}\in\mathbb{R}^{H\times W}$},
  600. we set the origin to the center of the image.
  601. %
  602. In the 2D plane, a straight line $l$ can be parameterized by two parameters: an orientation parameter $\theta_l\in [0, \pi)$ representing the angle between $l$ and the x-axis and a distance parameter $r_l$, indicating the distance between $l$ and the origin.
  603. %
  604. Obviously $\forall \ l \in I, r_l \in [-\sqrt{W^2+H^2}/2, \sqrt{W^2+H^2}/2]$.
Given any line $l$ on $I$, we can parameterize it with the above formulation,
and we can also perform a reverse mapping that translates any valid
$(r, \theta)$ pair back to a line instance.
  608. %
  609. \revise{We define the line-to-parameters and the inverse mapping
  610. as:
  611. \begin{equation}
  612. \begin{split}
  613. r_l, \theta_l &\leftarrow P(l), \\
  614. l &\leftarrow P^{-1}(r_l, \theta_l).
  615. \label{eq:parameterize}
  616. \end{split}
  617. \end{equation}
  618. Obviously, both $P$ and $P^{-1}$ are bijective mappings.}
  619. %
  620. In practice, $r$ and $\theta$ are quantized to discrete bins to be processed by computer programs.
  621. %
Suppose the quantization intervals for $r$ and $\theta$ are $\Delta r$ and $\Delta \theta$, respectively.
  623. %
  624. Then the quantization can be formulated as below:
  625. \begin{equation}
  626. \begin{split}
  627. \hat{r}_l = \ceil*{\frac{r_l}{\Delta r}}, \
  628. \hat{\theta}_l = \ceil*{\frac{\theta_l}{\Delta \theta}},
  629. \end{split}\label{eq:quantization}
  630. \end{equation}
where $\hat{r}_l$ and $\hat{\theta}_l$ are the quantized line parameters.
  632. %
The numbers of quantization levels, denoted by $\Theta$ and $R$, are:
  634. \begin{equation}
  635. \begin{split}
  636. \Theta = \frac{\pi}{\Delta \theta}, \
  637. R = \frac{\sqrt{W^2+H^2}}{\Delta r},
  638. \end{split}\label{eq:grid-size}
  639. \end{equation}
  640. as shown in~\cref{fig:DHT}.
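%
For concreteness, the quantization in~\cref{eq:quantization} and the grid size in~\cref{eq:grid-size} can be sketched in a few lines of Python/NumPy; the function names and the ceiling-based rounding of the grid size are illustrative assumptions of this sketch rather than the exact implementation:
\begin{verbatim}
import numpy as np

def quantize_line(r, theta, delta_r, delta_theta):
    # Quantize continuous line parameters into discrete bins
    # (cf. the ceiling operations above).
    r_hat = int(np.ceil(r / delta_r))
    theta_hat = int(np.ceil(theta / delta_theta))
    return r_hat, theta_hat

def grid_size(H, W, delta_r, delta_theta):
    # Number of quantization levels of the parametric space;
    # rounding up to integers is an assumption made here.
    Theta = int(np.ceil(np.pi / delta_theta))
    R = int(np.ceil(np.sqrt(H ** 2 + W ** 2) / delta_r))
    return Theta, R
\end{verbatim}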
  641. %-----------------------------------------------------------------------------------%
  642. \subsection{Feature Transformation with Deep Hough Transform} \label{sec:dht-dht}
  643. \CheckRmv{
  644. \begin{figure*}[tb]
  645. \centering
  646. \hfill
  647. \subfigure[]{
  648. \label{fig:DHT}
  649. \begin{overpic}[height=.18\linewidth]{figures/dht.pdf}
  650. % \put(70.5, 11.5){$\mathcal{T}$}
  651. \put(-5, 19){$H$}
  652. \put(15, 33.3){$W$}
  653. %
  654. % \put(92, 16){$\Theta$}
  655. \put(55, 19){$\Theta$}
  656. \put(75, 33.3){$R$}
  657. %
  658. \put(20, 43){$\mathbf{X}$}
  659. \put(79, 43){$\mathbf{Y}$}
  660. \put(76, 8){$(\hat{\theta}_l, \hat{r}_l)$}
  661. \end{overpic}
  662. }\hfill
  663. \subfigure[]{
  664. \label{fig:nonlocal}
  665. \begin{overpic}[width=0.46\linewidth]{figures/nonlocal.pdf}
  666. \put(19, 39){$W$}
  667. \put(-6, 20){$H$}
  668. \put(80, 39){$\Theta$}
  669. \put(102, 20){$R$}
  670. \put(45, 24){DHT}
  671. \put(45, 11){RHT}
  672. \put(6, -4){feature space}
  673. \put(65, -4){parametric space}
  674. \end{overpic}
  675. }\hfill
  676. \caption{
  677. (a): Features along a line in the feature space (blue, left) are
  678. accumulated to a point $(\hat{r}_l,\hat{\theta}_l)$ in the parametric space (red, right).
  679. (b): Illustration of the proposed context-aware feature aggregation.
  680. %
  681. Features of nearby lines in the feature space (left)
  682. are translated into neighbor points in the parametric space (right).
  683. %
  684. In the parametric space, a simple $3\times 3$ convolutional operation can
  685. easily capture contextual information for the central line (orange).
  686. %
  687. Best viewed in color.
  688. }
  689. \end{figure*}
  690. }
  691. %-----------------------------------------------------------------------------------%
  692. \subsubsection{Deep Hough transform.}
  693. Given an input image $I$, we first extract deep CNN features
  694. $\mathbf{X} \in \mathbb{R} ^ {C \times H \times W}$ with the encoder network,
  695. where $C$ indicates the number of channels and $H$ and $W$ are the spatial size.
  696. %
  697. Afterward, the deep Hough transform (DHT) takes $\mathbf{X}$ as input and produces
  698. the transformed features, $\mathbf{Y} \in \mathbb{R} ^ {C \times \Theta \times R}$.
  699. %
The size of the transformed features, $\Theta\times R$, is determined by the quantization intervals, as
  701. described in~\cref{eq:grid-size}.
  702. As shown in~\cref{fig:DHT},
  703. given an arbitrary line $l$ on the image,
we aggregate the features of all pixels along $l$
into position $(\hat{\theta}_l, \hat{r}_l)$ in the parametric space $\mathbf{Y}$:
  706. \begin{equation}
  707. \mathbf{Y}(\hat{\theta}_l, \hat{r}_l) = \sum_{i\in l} \mathbf{X}(i),
  708. \label{eq:dht}
  709. \end{equation}
  710. where $i$ is the positional index.
  711. %
  712. $\hat{\theta}_l$ and $\hat{r}_l$ are determined by the parameters of line $l$, according
  713. to~\cref{eq:parameterize}, and then quantized into discrete grids, according to~\cref{eq:quantization}.
  714. \revise{Given the number of quantization levels $\Theta$ and $R$,
  715. we have $\Theta\cdot R$ unique line candidates.
  716. %
  717. Then the DHT is applied to all these candidate lines and their respective features
  718. are aggregated to the corresponding position in $\mathbf{Y}$.}
  719. %
  720. % These lines are obtained by connecting arbitrary pairs of pixels on the edges
  721. % of an image, and then excluding the duplicated lines.
  722. %
  723. It is worth noting that DHT is order-agnostic in both the feature space and the parametric space,
  724. making it highly parallelizable.
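%
To make the aggregation of~\cref{eq:dht} concrete, the following un-optimized NumPy sketch scatters the feature of every pixel into its $(\hat{\theta}, \hat{r})$ cell for each discretized angle, which is equivalent to summing features along every candidate line; the normal-form parameterization, the centered coordinates, and the flooring-based binning are simplifying assumptions of the sketch, not the exact implementation:
\begin{verbatim}
import numpy as np

def deep_hough_transform(X, delta_r=1.0, delta_theta=np.pi / 180):
    # X: (C, H, W) feature map; returns Y: (C, Theta, R).
    C, H, W = X.shape
    diag = np.sqrt(H ** 2 + W ** 2)
    Theta = int(np.ceil(np.pi / delta_theta))
    R = int(np.ceil(diag / delta_r))
    Y = np.zeros((C, Theta, R), dtype=X.dtype)
    ys, xs = np.mgrid[0:H, 0:W]
    # coordinates with the origin at the image center
    xc, yc = xs - (W - 1) / 2.0, (H - 1) / 2.0 - ys
    for t in range(Theta):
        theta = t * delta_theta
        # signed distance of every pixel to the origin along the
        # normal direction of angle theta (normal-form lines)
        r = xc * np.cos(theta) + yc * np.sin(theta)
        r_bin = np.clip(((r + diag / 2) / delta_r).astype(int), 0, R - 1)
        for c in range(C):
            # accumulate features of collinear pixels into Y[c, t, :]
            np.add.at(Y[c, t], r_bin.ravel(), X[c].ravel())
    return Y
\end{verbatim}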
  725. %-----------------------------------------------------------------------------------%
  726. \subsubsection{Multi-scale DHT with FPN.}\label{sec:ms-dht-fpn}
Our proposed DHT can be easily applied to arbitrary spatial features.
  728. %
  729. We use the feature pyramid network (FPN)~\cite{lin2017feature} as our encoder.
  730. %
  731. FPN can help to extract multi-scale and rich semantic features.
  732. Specifically, the FPN outputs 4 feature maps $X_1, X_2, X_3, X_4$ and their respective
  733. resolutions are $1/4$, $1/8$, $1/16$, $1/16$ of the input resolution.
  734. %
  735. Then each feature map is transformed by a DHT module independently, as shown in~\cref{fig:pipeline}.
  736. %
Since these feature maps have different resolutions and we use the same quantization
intervals in all stages (see~\cref{eq:grid-size} for details), the transformed features
$Y_1, Y_2, Y_3, Y_4$ also have different sizes.
  740. %
To fuse the transformed features, we interpolate $Y_2, Y_3, Y_4$
to the size of $Y_1$ and then concatenate them.
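%
A minimal PyTorch sketch of this fusion step is given below; the bilinear interpolation mode and the function name are assumptions of the sketch:
\begin{verbatim}
import torch
import torch.nn.functional as F

def fuse_transformed_features(Y1, Y2, Y3, Y4):
    # Y_i: (N, C_i, Theta_i, R_i) parametric features of stage i.
    # Resize the smaller maps to the resolution of Y1 and concatenate.
    target = Y1.shape[-2:]
    ups = [F.interpolate(Y, size=target, mode='bilinear',
                         align_corners=False) for Y in (Y2, Y3, Y4)]
    return torch.cat([Y1, *ups], dim=1)
\end{verbatim}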
  743. %-----------------------------------------------------------------------------------%
  744. \subsection{Line Detection in the Parametric Space}
  745. %-----------------------------------------------------------------------------------%
  746. \subsubsection{Context-aware line detector.}\label{sec:ctx-line-detector}
  747. % Our detector takes DHT-transformed features as input, and outputs a $\Theta\times R$
  748. % probability map.
After the deep Hough transform (DHT), features are translated to the parametric space,
where each grid location $(\theta, r)$ corresponds to
the features along an entire line $l=P^{-1}(\theta, r)$ in the feature space.
  752. %
  753. An important reason to transform the features into the parametric space
  754. is that the line structures could
  755. be more compactly represented.
  756. %
  757. As shown in~\cref{fig:nonlocal},
  758. lines nearby a specific line $l$ are translated to
  759. surrounding points near $(\theta_l, r_l)$.
  760. %
  761. Consequently, features of nearby lines can be efficiently aggregated
  762. using convolutional layers in the parametric space.
  763. In each stage of the FPN,
  764. we use two $3\times 3$ convolutional layers
  765. to aggregate contextual line features.
  766. %
  767. Then we interpolate features
to match the resolutions of features from different stages, as illustrated
in~\cref{fig:pipeline},
  770. and concatenate the interpolated features together.
  771. %
  772. Finally, a $1\times 1$ convolutional layer is applied to the concatenated feature maps
  773. to produce pointwise predictions.
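%
The detector head can be sketched as follows; for brevity, the two $3\times 3$ context layers and the final $1\times 1$ prediction layer are condensed into a single module, and the channel widths and the absence of normalization layers are assumptions of the sketch rather than the exact architecture:
\begin{verbatim}
import torch.nn as nn

class ContextAwareHead(nn.Module):
    # Two 3x3 convolutions aggregate features of nearby lines in the
    # parametric space; a 1x1 convolution outputs the point-wise
    # line existence logits.
    def __init__(self, in_ch, mid_ch=64):
        super().__init__()
        self.ctx = nn.Sequential(
            nn.Conv2d(in_ch, mid_ch, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(mid_ch, mid_ch, 3, padding=1), nn.ReLU(inplace=True))
        self.predict = nn.Conv2d(mid_ch, 1, 1)

    def forward(self, y):  # y: fused parametric features (N, C, Theta, R)
        return self.predict(self.ctx(y))
\end{verbatim}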
  774. %-----------------------------------------------------------------------------------%
  775. \subsubsection{Loss function.}\label{sec:loss-func}
  776. Since the prediction is directly produced in the parametric space,
  777. we calculate the loss in the same space as well.
  778. %
  779. For a training image $I$, the ground-truth lines are first converted into
  780. the parametric space with the standard Hough transform.
  781. %
Then, to help the network converge faster, we smooth and expand the ground-truth with a
  783. Gaussian kernel.
  784. %
  785. Similar tricks have been used in many other tasks like
crowd counting~\cite{liu2019context,cheng2019learning} and road segmentation~\cite{VecRoad_20CVPR}.
  787. %
Formally, let $\mathbf{G}$ be the binary ground-truth map in the parametric space,
where $\mathbf{G}_{i,j} = 1$ indicates that there is a line located at $(i,j)$.
  790. %
  791. The expanded ground-truth map is
  792. $$\hat{\mathbf{G}} = \mathbf{G}\circledast K,$$
  793. where $K$ is a $5\times 5$ Gaussian kernel and $\circledast$ denotes the convolution operation.
  794. %
  795. An example pair of smoothed ground-truth and the predicted map is shown
  796. in~\cref{fig:pipeline}.
In the end, we compute the cross-entropy between the smoothed ground-truth
$\hat{\mathbf{G}}$ and the predicted map $\mathbf{P}$ in the parametric space:
\begin{equation}
L = -\sum_i \Big\{ \hat{\mathbf{G}}_i\cdot\log(\mathbf{P}_i) +
(1-\hat{\mathbf{G}}_i)\cdot\log(1-\mathbf{P}_i)
\Big\},
\end{equation}
where $i$ iterates over all positions in the parametric space.
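%
A minimal PyTorch sketch of this loss is shown below; using the logits-based cross-entropy and clamping the smoothed target to $[0,1]$ are choices of the sketch, not necessarily those of the actual implementation:
\begin{verbatim}
import torch.nn.functional as F

def dht_loss(pred_logits, G, gaussian_kernel):
    # G: (N, 1, Theta, R) binary ground-truth map in parametric space.
    # gaussian_kernel: (1, 1, 5, 5) tensor, the 5x5 Gaussian kernel K.
    G_hat = F.conv2d(G, gaussian_kernel, padding=2).clamp(max=1.0)
    # cross-entropy between the smoothed target and the prediction
    return F.binary_cross_entropy_with_logits(pred_logits, G_hat)
\end{verbatim}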
  804. %-----------------------------------------------------------------------------------%
  805. \subsection{Reverse Mapping}\label{sec:reverse}
  806. Our detector produces predictions in the parametric space representing
  807. the probability of the existence of lines.
  808. %
  809. The predicted map is then binarized with a threshold (\eg 0.01).
  810. %
Then we find the connected areas and calculate their respective centroids.
  812. %
  813. These centroids are regarded as the parameters of detected lines.
  814. %
  815. At last, all lines are mapped back to the image space with
  816. $P^{-1}(\cdot)$, as formulated in~\cref{eq:parameterize}.
  817. %
We refer to this ``mapping back'' step as the ``Reverse Hough Transform (RHT)'',
  819. as shown in~\cref{fig:pipeline}.
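%
The reverse mapping step can be sketched with SciPy as follows; the treatment of centroids and the use of \texttt{scipy.ndimage} are assumptions of the sketch:
\begin{verbatim}
from scipy import ndimage

def detect_lines(prob_map, threshold=0.01):
    # Binarize the predicted map, find connected areas, and take
    # their centroids as the (theta_bin, r_bin) parameters of the
    # detected lines; they are mapped back to image space by P^{-1}.
    binary = prob_map > threshold
    labels, num = ndimage.label(binary)
    centroids = ndimage.center_of_mass(binary, labels,
                                       range(1, num + 1))
    return list(centroids)
\end{verbatim}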
  820. %-----------------------------------------------------------------------------------%
  821. \subsection{Edge-guided Line Refinement}\label{sec:refine}
  822. Semantic lines are outstanding structures that
  823. separate different regions in a scene.
  824. %
  825. Therefore, edges may serve as indicators for semantic lines.
  826. %
  827. We propose to refine the detection results by aligning line positions using
  828. edge information.
  829. %
  830. First, we compute an edge map $E$ using HED~\cite{xie2015holistically}.
  831. %
  832. Afterward, given a detected line $l$, the edge density of $l$ is defined as the average
  833. edge response along $l$:
  834. \begin{equation}
  835. \rho(l) = \frac{\sum_{i\in l} E_i}{|l|},
  836. \label{eq:rho}
  837. \end{equation}
  838. where $|l|$ is the number of pixels on $l$.
  839. %
For the sake of stability, we widen $l$ by 1 pixel on each side (so that the total width is 3 pixels)
when evaluating~\cref{eq:rho}.
  842. Let $\mathcal{L}$ be a set of lines that are close to $l$.
  843. %
  844. These lines are obtained by moving the end-points of $l$ by $\delta_r$ pixels clockwise
  845. and anti-clockwise.
  846. %
  847. Since there are two end-points and each one has
  848. $\delta_r+1$ possible locations, the size of the set is $||\mathcal{L}|| = (\delta_r+1)^2$.
  849. %
Then the refinement can be achieved by finding the optimal line $l^*$ in $\mathcal{L}$
that has the highest edge density:
  852. \begin{equation}
  853. l^* = \argmax_{l\in\mathcal{L}} \ \rho(l).\label{eq:refine-search}
  854. \end{equation}
  855. The performance of ``edge-guided line refinement'' with different $\delta_r$
is recorded in~\cref{sec:ablation-refinement}.
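%
A sketch of the refinement defined by~\cref{eq:rho} and~\cref{eq:refine-search} is given below; the rasterization and widening of the candidate lines are assumed to be done elsewhere, and the interface is illustrative:
\begin{verbatim}
import numpy as np

def edge_density(edge_map, line_pixels):
    # line_pixels: (M, 2) array of (row, col) coordinates of a
    # (widened) rasterized line; returns the average edge response.
    rows, cols = line_pixels[:, 0], line_pixels[:, 1]
    return edge_map[rows, cols].mean()

def refine_line(edge_map, candidate_lines):
    # Pick the candidate line with the highest edge density.
    densities = [edge_density(edge_map, l) for l in candidate_lines]
    return candidate_lines[int(np.argmax(densities))]
\end{verbatim}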
  857. % \cref{fig:edge} shows the processing of the refinement.
  858. % \begin{figure}[!htb]
  859. % \centering
  860. % \begin{overpic}[width=1.0\linewidth]{figures/edge_align.pdf}
  861. % \put(13, -5){(a)}
  862. % \put(48, -5){(b)}
  863. % \put(81, -5){(c)}
  864. % \end{overpic}
  865. % \vspace{4pt}
  866. % \caption{
  867. % Example of refining a line with edge information.
  868. % (a): The origin detected line (yellow).
  869. % (b): Density of detected line (yellow area) and a nearby line (green).
  870. % (c): The detected line is replaced by the nearby line (green) which has a larger density.
  871. % }
  872. % \label{fig:edge}
  873. % \end{figure}
  874. \CheckRmv{
  875. \begin{figure*}[tb!]
  876. \centering
  877. \subfigure[]{
  878. \begin{overpic}[width=.19\linewidth]{figures/metric_analysis_1.pdf}
  879. \put(54,66){m}
  880. \put(74,32){n}
  881. \put(10,50){p}
  882. \put(7,25){q}
  883. \put(19,10){{IOU(m,n)=0.66}}
  884. \put(20,0){{IOU(p,q)=0.95}}
  885. \end{overpic}\label{fig:metric_a}
  886. }
  887. \hfill
  888. \subfigure[]{
  889. \begin{overpic}[width=.19\linewidth]{figures/metric_analysis_2.pdf}
  890. \put(18,10){{IOU(red)=0.53}}
  891. \put(16.5,0){{IOU(blue)=0.46}}
  892. \end{overpic}\label{fig:metric_b}
  893. }
  894. \hfill
  895. \subfigure[]{
  896. \begin{overpic}[width=.19\linewidth]{figures/metric_analysis_3.pdf}
  897. \put(54,66){m}
  898. \put(74,32){n}
  899. \put(10,50){p}
  900. \put(7,25){q}
  901. %
  902. \put(22,10){{$\mathcal{S}$(m,n)=0.24}}
  903. \put(23,0){{$\mathcal{S}$(p,q)=0.34}}
  904. \end{overpic}\label{fig:metric_c}
  905. }
  906. \hfill
  907. \subfigure[]{
  908. \begin{overpic}[width=.19\linewidth]{figures/metric_analysis_4.pdf}
  909. \put(16,10){{$\mathcal{S}_d$=0.54, $\mathcal{S}_\theta$=0.02}}
  910. \put(28.5,0){{$\mathcal{S}$=0.00}}
  911. \end{overpic}\label{fig:metric_d}
  912. }\vspace{-10pt}
  913. \caption{
  914. (a): Two pairs of lines with similar relative position could have very
  915. different IOU scores.
  916. %
  917. (b): Even humans cannot determine which area (blue or red) should be considered as
  918. the intersection in the IOU-based metric~\cite{lee2017semantic}.
  919. %
  920. (c) and (d): Our proposed metric considers both Euclidean distance and angular distance
  921. between a pair of lines, resulting in consistent and reasonable scores.
  922. %
  923. Best viewed in color.
  924. }
  925. \label{fig:metric_analysis}
  926. \end{figure*}
  927. }
  928. \CheckRmv{
  929. \begin{figure*}[t!]
  930. \centering
  931. \begin{overpic}[width=1.0\linewidth]{figures/sl_scores.pdf}
  932. \put(6, -1.5){$\mathcal{S}=0.1$}
  933. \put(26.5, -1.5){$\mathcal{S}=0.3$}
  934. \put(46.5, -1.5){$\mathcal{S}=0.5$}
  935. \put(66.5, -1.5){$\mathcal{S}=0.7$}
  936. \put(86.5, -1.5){$\mathcal{S}=0.9$}
  937. % \put(7, 0.5){$\mathcal{S}=0.8$}
  938. % \put(31.5, 0.5){$\mathcal{S}=0.85$}
  939. % \put(57.5, 0.5){$\mathcal{S}=0.9$}
  940. % \put(82, 0.5){$\mathcal{S}=0.95$}
  941. \end{overpic}
  942. \caption{
Example lines with various EA-scores ($\mathcal{S}$ in~\cref{eq:ea-score}).
  944. \revise{The larger the EA-score is, the more similar the lines are.}
  945. }\label{fig:metric_show}
  946. \end{figure*}
  947. }
  948. %-----------------------------------------------------------------------------------%
  949. \section{The Proposed Evaluation Metric}
  950. \label{sec:metric}
  951. %-----------------------------------------------------------------------------------%
In this section, we elaborate on the
proposed evaluation metric that measures the agreement, or alternatively,
the similarity, between two lines in an image.
  955. %
First, we review several widely used metrics in the computer vision community
and then explain why these existing metrics are not appropriate for our task.
  958. %
  959. Finally, we introduce our newly proposed metric, which measures the agreement between two lines considering
  960. both Euclidean distance and angular distance.
  961. %
  962. %-----------------------------------------------------------------------------------%
  963. \subsection{Review of Existing Metrics}
The intersection over union (IOU) is widely used in object detection, semantic segmentation,
and many other tasks to measure the agreement between detected bounding boxes
(or segments) and the ground-truth.
%
Lee \etal \cite{lee2017semantic} adapt the original IOU to line detection,
and propose a line-based IOU to evaluate the quality of detected lines.
%
Concretely, the similarity between two lines is measured by the intersection area
of the lines divided by the image area.
%
Taking~\cref{fig:metric_a} as an example, the similarity between lines $m$ and $n$
is $\text{IOU}(m,n) = area({{red}})/area(I)$.
  976. However, we argue that this IOU-based metric is improper and may lead to unreasonable
  977. or ambiguous results under specific circumstances.
  978. %
As illustrated in~\cref{fig:metric_a},
two pairs of lines ($m$, $n$ and $p$, $q$) with similar structures can have
very different IOU scores.
%
In~\cref{fig:metric_b}, even humans cannot determine which area
({red} or {blue}) should be used as
the intersection area in the line-based IOU.
\rerevise{There are other metrics,
\eg the Earth Mover's Distance (EMD)~\cite{rubner2000earth}
and the Chamfer distance (CD)~\cite{borgefors1986distance},
that can be used to measure line similarity.
%
However, these metrics require rasterizing the lines into pixels
and then calculating pixel-wise distances, which is less efficient.
}
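To make the efficiency argument concrete, the following illustrative sketch
(not part of our released code; the function names and the sampling density are assumptions)
computes a symmetric Chamfer distance between two line segments by first rasterizing them
into pixels, which requires a full pairwise-distance matrix between the two pixel sets.
\begin{verbatim}
import numpy as np

def rasterize(line, n=1000):
    # Sample points densely along the segment and round them to integer
    # pixel coordinates; a crude stand-in for a proper line rasterizer.
    (x1, y1), (x2, y2) = line
    t = np.linspace(0.0, 1.0, n)
    pts = np.stack([x1 + t * (x2 - x1), y1 + t * (y2 - y1)], axis=1)
    return np.unique(np.round(pts).astype(int), axis=0)

def chamfer_distance(l1, l2):
    # Symmetric Chamfer distance over rasterized pixels: each pixel of one
    # line is matched to its nearest pixel on the other line, which needs
    # a full pairwise-distance matrix and is therefore comparatively slow.
    p, q = rasterize(l1), rasterize(l2)
    d = np.linalg.norm(p[:, None, :].astype(float) - q[None, :, :], axis=2)
    return 0.5 * (d.min(axis=1).mean() + d.min(axis=0).mean())
\end{verbatim}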
\rerevise{
To remedy these deficiencies,
we propose a simple yet effective metric that measures the similarity
of two lines in the parametric space.
%
Our proposed metric is much more efficient than EMD and CD.
%
Quantitative comparisons in~\cref{sec:quantitative} demonstrate that
our proposed metric yields results very similar to those of EMD and CD.}
  1003. \CheckRmv{
  1004. \begin{figure*}[!htb]
  1005. \centering
  1006. \begin{overpic}[width=1\linewidth]{figures/dataset_vis.jpg}
  1007. \end{overpic}\vspace{-8pt}
  1008. \caption{
  1009. Example images and annotations (yellow lines) of \revise{NKL}.
  1010. %
  1011. Images of \revise{NKL} present diverse scenes and rich line annotations.
  1012. }
  1013. \label{fig:dataset_vis}
  1014. \end{figure*}
  1015. }
  1016. %-----------------------------------------------------------------------------------%
  1017. \subsection{The Proposed Metric}
  1018. Our proposed metric, \revise{termed \textbf{EA-score}}, considers both
  1019. \textbf{E}uclidean distance and \textbf{A}ngular distance
  1020. between a pair of lines.
  1021. %
Let $l_i, l_j$ be a pair of lines to be measured.
The angular distance $\mathcal{S}_\theta$ is defined according to the
angle between the two lines:
  1025. \begin{equation}
  1026. \mathcal{S}_\theta = 1 - \frac{\theta(l_i, l_j)}{\pi/2},
  1027. \label{eq:sa}
  1028. \end{equation}
  1029. where $\theta(l_i, l_j)$ is the angle between $l_i$ and $l_j$.
  1030. %
  1031. The Euclidean distance is defined as:
  1032. %
  1033. \begin{equation}
  1034. \mathcal{S}_d = 1 - D(l_i, l_j),
  1035. \label{eq:sd}
  1036. \end{equation}
where $D(l_i, l_j)$ is the Euclidean distance between the midpoints of $l_i$ and $l_j$.
  1038. %
  1039. % Examples of $\mathcal{S}_\theta$ and $\mathcal{S}_d$ can be found in ~\cref{fig:metric_c} and ~\cref{fig:metric_d}.
  1040. %
Note that we normalize the image into the unit square before calculating $D(l_i, l_j)$.
%
Examples of $\mathcal{S}_d$ and $\mathcal{S}_\theta$ can be found
in~\cref{fig:metric_c} and~\cref{fig:metric_d}.
  1045. %
  1046. Finally, our proposed EA-score is:
  1047. \begin{equation}
  1048. \mathcal{S} = (\mathcal{S}_\theta \cdot \mathcal{S}_d)^2.
  1049. \label{eq:ea-score}
  1050. \end{equation}
The product in~\cref{eq:ea-score} is squared to make the metric more sensitive and discriminative
when the similarity values are high.
  1053. Several example line pairs and corresponding EA-scores
  1054. are demonstrated in~\cref{fig:metric_show}.
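For reference, a minimal sketch of the EA-score computation is given below;
it assumes each line is represented by its two endpoints with coordinates already normalized
to the unit square, clamps the Euclidean term at zero (midpoint distances in the unit square
can exceed 1), and uses illustrative function names rather than our released code.
\begin{verbatim}
import math

def ea_score(l1, l2):
    # Each line is ((x1, y1), (x2, y2)), already normalized to the unit square.
    def angle(line):
        (x1, y1), (x2, y2) = line
        return math.atan2(y2 - y1, x2 - x1) % math.pi  # undirected angle in [0, pi)

    def midpoint(line):
        (x1, y1), (x2, y2) = line
        return ((x1 + x2) / 2.0, (y1 + y2) / 2.0)

    # Angular term S_theta: acute angle between the lines, mapped to [0, 1].
    d_theta = abs(angle(l1) - angle(l2))
    d_theta = min(d_theta, math.pi - d_theta)
    s_theta = 1.0 - d_theta / (math.pi / 2.0)

    # Euclidean term S_d: distance between midpoints, clamped to [0, 1].
    (ax, ay), (bx, by) = midpoint(l1), midpoint(l2)
    s_d = max(0.0, 1.0 - math.hypot(ax - bx, ay - by))

    # EA-score: squared product of the two terms.
    return (s_theta * s_d) ** 2
\end{verbatim}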
  1055. %-----------------------------------------------------------------------------------%
  1056. \section{\revise{NKL: a semantic line detection dataset}}\label{sec:nkl-dataset}
  1057. %-----------------------------------------------------------------------------------%
  1058. To the best of our knowledge, there is only one dataset, SEL~\cite{lee2017semantic},
  1059. specifically for semantic line detection.
  1060. %
SEL contains 1,715 images, of which 175 are used for testing
and the rest for training.
%
To bridge the gap between the capacity of large CNN-based models and \revise{the} scale of the existing
dataset,
we collect a new dataset for semantic line detection.
  1067. The new dataset, \revise{namely NKL (short for \textbf{N}an\textbf{K}ai \textbf{L}ines), contains 6,500 images} that present richer
  1068. diversity in terms of both scenes and the number of lines.
  1069. %
  1070. Each image of \revise{NKL} is annotated by multiple skilled human annotators to
  1071. ensure the annotation quality.
  1072. %
The dataset is openly available on our project page.
  1074. \CheckRmv{
  1075. \begin{table}[hbt!]
  1076. \renewcommand{\arraystretch}{1.3}
  1077. \renewcommand\tabcolsep{5pt}
  1078. \caption{Number of images and lines in SEL~\cite{lee2017semantic} and \revise{NKL}.}
  1079. \vspace{-5pt}
  1080. \begin{tabular}{c|c|c|c}
  1081. \toprule
  1082. Dataset &
  1083. \makecell{Total \\ \#images, \#lines} &
  1084. \makecell{Training \\ \#images, \#lines} &
  1085. \makecell{Evaluation \\ \#images, \#lines} \\
  1086. \hline
  1087. SEL~\cite{lee2017semantic} & 1,715, \ 2,791 & 1,541, \ 2,493 & 175, \ 298\\
  1088. \revise{NKL (Ours)} & \revise{6,500}, \ \revise{13,148} & \revise{5,200}, \ \revise{10,498} & \revise{1,300, \ 2,650} \\
  1089. \bottomrule
  1090. \end{tabular}
  1091. \label{tab:number_statistics}
  1092. \end{table}
  1093. }
  1094. % The new dataset is openly available at ~\url{http://kaizhao.net/sl5k}.
  1095. \subsection{Data Collection and Annotation}\label{sec:data-collect-and-anno}
All the images of \revise{NKL} are crawled from the internet
% pixabay\footnote{\url{https://pixabay.com}}
using specific keywords such as sea and grassland.
%
After checking copyrights, we carefully select images that contain
at least one semantic line.
  1102. %
Since the annotation of semantic lines is subjective and depends on the annotators,
each image is first annotated by 3 knowledgeable human annotators
and then cross-verified.
%
A line is directly regarded as positive if all 3 annotators agree on it.
%
The inconsistent lines are then reviewed by two additional annotators.
%
In summary, each line is examined by at least 3 and at most 5 annotators,
and a line is regarded as positive only if it is marked as positive by at least 3 of them.
  1113. \subsection{Dataset Statistics}
  1114. \subsubsection{Number of images and semantic lines}
  1115. % There are 6,500 images in NKL dataset.
  1116. %
  1117. % Images in NKL are randomly split into training and evaluation sets that contain
  1118. % 5,200 and 1,300 images, respectively.
  1119. %
In total, there are \revise{13,148 semantic lines in NKL} and 2,791 semantic lines in
the SEL~\cite{lee2017semantic} dataset.
%
\cref{tab:number_statistics} summarizes the number of images and lines in the two datasets.
\cref{fig:number_statistics} shows the histogram of the per-image number of lines
in the \revise{NKL} and SEL~\cite{lee2017semantic} datasets.
%
About two-thirds \revise{(67\%, 4,356/6,500)} of the images in the \revise{NKL} dataset contain more than 1 semantic line,
while the corresponding percentage for SEL is only 45.5\%.
  1129. \CheckRmv{
  1130. \begin{figure}[!htb]
  1131. \centering
  1132. \begin{overpic}[width=1\linewidth]{figures/line_numbers.pdf}
  1133. \put(15, 26){\rotatebox{30}{\scriptsize 33.0\%}}
  1134. \put(19, 41){\rotatebox{30}{\scriptsize 54.5\%}}
  1135. \put(26, 31){\rotatebox{29}{\scriptsize 40.2\%}}
  1136. \put(30, 25.5){\rotatebox{30}{\scriptsize 31.5\%}}
  1137. \put(37, 17.3){\rotatebox{30}{\scriptsize 19.4\%}}
  1138. \put(40, 11.5){\rotatebox{30}{\scriptsize 11.3\%}}
  1139. \put(47, 8.6){\rotatebox{30}{\scriptsize 6.7\%}}
  1140. \put(51, 5){\rotatebox{30}{\scriptsize 2.1\%}}
  1141. \put(58.5, 5.5){\rotatebox{30}{\scriptsize 0.54\%}}
  1142. \put(62.2, 4.5){\rotatebox{30}{\scriptsize 0.41\%}}
  1143. \put(69, 4.8){\rotatebox{30}{\scriptsize 0.01\%}}
  1144. \put(73, 4.2){\rotatebox{30}{\scriptsize 0.01\%}}
  1145. \put(80, 5){\rotatebox{30}{\scriptsize 0.01\%}}
  1146. \put(83, 4){\rotatebox{30}{\scriptsize 0}}
  1147. \put(90.5, 5){\rotatebox{30}{\scriptsize 0.003\%}}
  1148. \put(93.5, 4){\rotatebox{30}{\scriptsize 0}}
  1149. \put(78, 39.6){\scriptsize NKL (Ours)}
  1150. \put(78, 34.4){\scriptsize SEL~\cite{lee2017semantic}}
  1151. \end{overpic}
  1152. \caption{
\revise{Histogram of the per-image number of lines.
%
Lines in our dataset are more evenly distributed than those in
SEL.}
  1157. }
  1158. \label{fig:number_statistics}
  1159. \end{figure}
  1160. }
  1161. \subsubsection{Diversity Analysis}
To analyze the diversity of the SEL and \revise{NKL} datasets,
we feed all the images into a ResNet50~\cite{he2016deep} network pretrained on Places365~\cite{zhou2017places},
and then collect the outputs as scene category labels.
%
The results are presented in~\cref{fig:dataset_distribution}.
There are 365 categories in the Places365~\cite{zhou2017places} dataset in total,
among which we obtain 167 unique category labels on the SEL dataset and \revise{327 on NKL}.
%
Besides, as shown in~\cref{fig:dataset_distribution}, scene labels on
the \revise{NKL} dataset are more evenly distributed than those on the SEL dataset.
%
For example, in the SEL dataset, the top-3 most populated categories (sky, field, and desert) make up more
than a quarter of the total,
whereas in \revise{NKL}, the top-3 categories make up less than one-fifth of the total.
  1177. % As shown in~\cref{fig:number_statistics},
  1178. % More than half images in SEL dataset contain only one semantic line.
  1179. % %
  1180. % About 38\% images in NKL-pre (ours) dataset contain one semantic line, 42\% images contain two semantic lines and 20\% images contain three or more semantic lines.
  1181. % \todo{
  1182. % we abtain 167 categories on SEL dataset and 288 categories on NKL-pre dataset. As shown in ~\cref{fig:dataset_distribution}, the distribution on NKL-pre is more diverse
  1183. % with more categories and balanced xxx.}
  1184. \CheckRmv{
  1185. \begin{figure*}[tb]
  1186. \centering
  1187. \hfill
  1188. \subfigure[]{
  1189. \begin{overpic}[width=.48\linewidth]{figures/SEL.pdf}
  1190. \put(6, 55){SEL~\cite{lee2017semantic}}
  1191. \end{overpic}
  1192. }\hfill
  1193. \subfigure[]{
  1194. \begin{overpic}[width=0.465\linewidth]{figures/NKL.pdf}
  1195. \put(5, 57){NKL}
  1196. \end{overpic}
  1197. }\hfill
  1198. \vspace{-10pt}
  1199. \caption{
\revise{
Category distribution of the SEL (a) and NKL (b) datasets.
%
Category labels are obtained with a Places365-pretrained model.
%
327 of the 365 scene labels appear in the NKL dataset,
in contrast to 167 in the SEL dataset.
%
The labels of NKL are also more evenly distributed than
those of SEL.}
  1210. }
  1211. \label{fig:dataset_distribution}
  1212. \end{figure*}
  1213. }
  1214. %-----------------------------------------------------------------------------------%
  1215. %-----------------------------------------------------------------------------------%
  1216. \section{Experiments}\label{sec:experiments}
  1217. %-----------------------------------------------------------------------------------%
In this section, we introduce the implementation details of our system
and report experimental results in comparison with existing methods.
  1220. %-----------------------------------------------------------------------------------%
  1221. \subsection{Implementation Details} \label{sec:protocol}
  1222. %
  1223. Our system is implemented with the PyTorch~\cite{paszke2019pytorch} framework,
  1224. and a Jittor~\cite{hu2020jittor} implementation is also available.
  1225. %
Since the proposed deep Hough transform (DHT) is highly parallelizable,
we implement DHT with native CUDA programming,
and all other parts are implemented with the framework-level Python API.
  1229. We use a single RTX 2080 Ti GPU for all experiments.
  1230. %-----------------------------------------------------------------------------------%
  1231. \subsubsection{Network architectures.}
We use two representative network architectures,
ResNet50~\cite{he2016deep} and VGGNet16~\cite{simonyan2014very}, as our backbones,
together with an FPN~\cite{lin2017feature} to extract multi-scale deep representations.
%
For the ResNet backbone, following the common practice in previous works~\cite{zhao2017pyramid,chen2018encoder,pami20Res2net},
dilated convolution~\cite{yu2015multi} is used in the last stage to increase the resolution of the feature maps. The commonly used batch normalization~\cite{ioffe2015batch} is also adopted in the network.
The dilated convolutions and normalization could be further tuned with~\cite{gao2021rbn,gao2021global2local} in future work.
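As a rough illustration only (the class name, channel choices, and use of torchvision utilities
are assumptions rather than our released implementation), a comparable ResNet50 backbone with a
dilated last stage and an FPN on top can be sketched as follows.
\begin{verbatim}
import torch
from torch import nn
from torchvision.models import resnet50
from torchvision.ops import FeaturePyramidNetwork

class LineFeatureExtractor(nn.Module):
    # Hypothetical ResNet50 + FPN feature extractor; the dilated last stage
    # keeps higher-resolution features, as described above.
    def __init__(self):
        super().__init__()
        net = resnet50(replace_stride_with_dilation=[False, False, True])
        self.stem = nn.Sequential(net.conv1, net.bn1, net.relu, net.maxpool)
        self.stages = nn.ModuleList([net.layer1, net.layer2,
                                     net.layer3, net.layer4])
        self.fpn = FeaturePyramidNetwork([256, 512, 1024, 2048],
                                         out_channels=256)

    def forward(self, x):
        x = self.stem(x)
        feats = {}
        for i, stage in enumerate(self.stages):
            x = stage(x)
            feats["c{}".format(i + 2)] = x
        # Multi-scale maps that would subsequently be fed to the DHT modules.
        return self.fpn(feats)
\end{verbatim}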
  1239. %-----------------------------------------------------------------------------------%
  1240. \subsubsection{Hyper-parameters.}
The size of the Gaussian kernel used in~\cref{sec:loss-func} is $5\times5$.
%
All images are resized to $400\times400$ and then grouped into mini-batches of 8.
%
We train all models for 30 epochs using the Adam optimizer~\cite{kingma2014adam} without weight decay.
%
The learning rate and momentum are set to $2 \times 10^{-4}$ and 0.9, respectively.
%
The quantization intervals $\Delta\theta$ and $\Delta r$ are detailed in
\cref{sec:quant-intervals} and~\cref{eq:intervals}.
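A minimal sketch of the corresponding optimizer and data-loader setup is given below;
\texttt{model}, \texttt{train\_set}, and the number of workers are placeholders, and the snippet
is illustrative rather than the released training script.
\begin{verbatim}
import torch
from torch.utils.data import DataLoader

def build_training_tools(model, train_set):
    # Adam with learning rate 2e-4 and no weight decay; beta1 = 0.9 plays
    # the role of the momentum value stated above.
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-4,
                                 betas=(0.9, 0.999), weight_decay=0)
    # Images are assumed to be resized to 400x400 by the dataset transform;
    # mini-batches contain 8 images.
    loader = DataLoader(train_set, batch_size=8, shuffle=True, num_workers=4)
    return optimizer, loader
\end{verbatim}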
  1251. %-----------------------------------------------------------------------------------%
  1252. \CheckRmv{
  1253. \begin{figure*}[!htb]
  1254. \centering
  1255. \begin{overpic}[width=1\linewidth]{figures/matching.pdf}
  1256. \put(-0.5,10){{\color{BlueGreen}{$p_1$}}}
  1257. \put(-0.5,8){{\color{BlueGreen}{$p_2$}}}
  1258. \put(-0.5,6){{\color{BlueGreen}{$p_3$}}}
  1259. \put(-0.5,4.2){{\color{BlueGreen}{$p_4$}}}
  1260. %
  1261. \put(15.5,11){{\color{orange}{$g_1$}}}
  1262. \put(15.5,7.2){{\color{orange}{$g_2$}}}
  1263. \put(15.5,4.5){{\color{orange}{$g_3$}}}
  1264. %
  1265. \put(26,1.5){$g_1$}
  1266. \put(31.5,1.5){$g_2$}
  1267. \put(37.8,1.5){$g_3$}
  1268. \put(22.5,15.2){$p_1$}
  1269. \put(29,15.2){$p_2$}
  1270. \put(35,15.2){$p_3$}
  1271. \put(41.3,15.2){$p_4$}
  1272. %
  1273. \put(54,1.5){$g_1$}
  1274. \put(61,1.5){$g_2$}
  1275. \put(67,1.5){$g_3$}
  1276. \put(51.5,15.2){$p_1$}
  1277. \put(57.3,15.2){$p_2$}
  1278. \put(63.3,15.2){$p_3$}
  1279. \put(70,15.2){$p_4$}
  1280. %
  1281. \put(95.1,1.5){\scriptsize FN}
  1282. \put(79,15.2){\scriptsize TP}
  1283. \put(86,15.2){\scriptsize TP}
  1284. \put(92,15.2){\scriptsize FP}
  1285. \put(98,15.2){\scriptsize FP}
  1286. %
  1287. \put(20,8){$\rightarrow$}
  1288. \put(47,8){$\rightarrow$}
  1289. \put(75,8){$\rightarrow$}
  1290. %
  1291. \put(8,-1){(a)}
  1292. \put(32,-1){(b)}
  1293. \put(60,-1){(c)}
  1294. \put(87.6,-1){(d)}
  1295. \end{overpic}
  1296. \caption{
  1297. Illustration of the bipartite graph matching in evaluation.
  1298. %
  1299. (a) An example image with 3 ground-truth lines ($g_1, g_2, g_3$)
  1300. and 4 predictions ($p_1, p_2, p_3, p_4$).
  1301. %
  1302. (b) the corresponding bipartite graph.
  1303. %
  1304. The edge between a pair of nodes represents the similarity ($\mathcal{S}$ in ~\cref{eq:ea-score}) between lines.
  1305. %
  1306. (c) after maximum matching of a bipartite graph, each node in a subgraph
  1307. is connected with no more than 1 node from the other subgraph.
  1308. %
  1309. (d) true positive (TP), false positive (FP) and false negative (FN).
  1310. }\vspace{20pt}
  1311. \label{fig:matching}
  1312. \end{figure*}
  1313. }
  1314. % \begin{equation}
  1315. % \begin{split}
  1316. % \Theta &= 100, \\
  1317. % R &= \sqrt{\frac{W^2+H^2}{2}},
  1318. % \end{split}\label{eq:intervals}
  1319. % \end{equation}
  1320. % where $H, W$ are the size of feature maps to be transformed in DHT.
  1321. \CheckRmv{
  1322. \begin{figure*}[htb!]
  1323. \centering
  1324. \begin{overpic}[width=0.9\linewidth]{figures/search_theta_r_pami.pdf}
  1325. \put(95,1){$\times\pi$}
  1326. \put(45,1){$\times\sqrt{2}$}
  1327. \end{overpic}\vspace{-10pt}
  1328. \caption{
  1329. Left: performance under different distance quantization intervals $\Delta r$ with a fixed
  1330. angular quantization interval $\Delta\theta=\pi/100$.
  1331. %
  1332. Larger $\Delta r$ indicates less quantization levels $R$.
  1333. Right: performance under different angular quantization intervals $\Delta \theta$ with a fixed
  1334. distance quantization interval $\Delta r=\sqrt{2}$.
  1335. }\label{fig:ablation-interval}
  1336. \end{figure*}
  1337. }
  1338. \subsubsection{Datasets and data augmentation.}
\revise{Our experiments are conducted on the
SEL~\cite{lee2017semantic} dataset and our proposed
NKL dataset.}
%
\revise{The statistics of the two datasets are detailed in~\cref{sec:nkl-dataset}.}
  1344. % which is, to the best of our knowledge, the only dataset for semantic line detection.
  1345. %
  1346. % The SEL dataset is composed of 1715 images, 1541 images for training, and 175 for testing.
  1347. %
  1348. % There are 1.63 lines per image on average, and
  1349. % each image contains 1 line at least and 6 lines at most.
  1350. %
Following the setup in~\cite{lee2017semantic}, we use only left-right flipping
as data augmentation in all our experiments.
  1353. \subsection{Evaluation Protocol}
We measure the quality of detected lines in terms of \emph{precision},
\emph{recall}, and \emph{F-measure}.
%
The first step is to match the detected lines with the ground-truth lines.
Let $\mathcal{P}$ and $\mathcal{G}$ be the sets of predicted lines
and ground-truth lines, respectively,
%
and let $p_i$ and $g_j$ denote individual predicted and ground-truth lines.
  1362. %
  1363. We first match the lines in $\mathcal{P}$ and $\mathcal{G}$ based on bipartite matching.
  1364. %
Let $G = \{V, E\}$ be a bipartite graph
\footnote{\url{https://en.wikipedia.org/wiki/Bipartite_graph}}.
%
The vertex set $V$ can be divided into two disjoint and independent sets,
in our case, $\mathcal{P}$ and $\mathcal{G}$:
  1370. \begin{equation*}
  1371. \begin{split}
  1372. V = \mathcal{P} \cup \mathcal{G} \\
  1373. \mathcal{P} \cap \mathcal{G} = \emptyset.
  1374. \end{split}
  1375. \end{equation*}
\rerevise{
Each edge in $E$ denotes the similarity between a pair of lines under
a certain similarity measure.
%
Apart from the proposed EA-score, we also use two other popular metrics:
the earth mover's distance (EMD)~\cite{rubner2000earth} and the Chamfer distance (CD)~\cite{borgefors1986distance}, as described in~\cref{sec:metric}.
%
Note that we normalize both EMD and CD by their maximal possible
values to bound them
within $[0, 1]$ (for both metrics, the maximal distance occurs when
the two lines shrink to two points at diagonally opposite corners of the image).}
Given the graph $G = \{V, E\}$, a matching in a bipartite graph is a set of edges chosen in such a way that no two edges share
a common vertex.
  1389. %
  1390. In our task, given the set of predicted lines $\mathcal{P}$ and the set of
  1391. ground-truth lines $\mathcal{G}$, we seek to find a matching so that
  1392. each ground-truth line $g_i$ corresponds to no more than one detected line $p_j$
  1393. and vice versa.
  1394. %
This problem, \ie maximum matching of a bipartite graph, can be solved by the
classical Hungarian method~\cite{kuhn1955hungarian} in polynomial time.
After matching $\mathcal{P}$ and $\mathcal{G}$, we can calculate the true positives
(TP), false positives (FP), and false negatives (FN) accordingly.
%
As illustrated in~\cref{fig:matching}, predicted lines ($p_1, p_2$) that are paired with
ground-truth lines ($g_2, g_1$) are considered true positives.
%
A predicted line that is not matched with any ground-truth line (\eg $p_3$) is
a false positive,
and a ground-truth line without a corresponding predicted line ($g_3$)
is a false negative.
  1407. Finally, the \textbf{P}recision, \textbf{R}ecall, and \textbf{F}-measure are:
  1408. \begin{equation}
  1409. P = \frac{TP}{TP+FP}, \
  1410. R = \frac{TP}{TP+FN}, \
  1411. F = \frac{2PR}{P + R}.
  1412. \label{eq:pr}
  1413. \end{equation}
We apply a series of thresholds $\tau = 0.01, 0.02, \ldots, 0.99$ to the matched prediction--ground-truth pairs.
%
Accordingly, we derive a series of precision, recall, and F-measure scores.
%
Finally, we evaluate the performance in terms of the average precision, recall, and
F-measure. \rerevise{We use EMD~\cite{rubner2000earth}, CD~\cite{borgefors1986distance}, and our proposed EA metric
for quantitative comparisons.
%
In the ablation study, we only use the EA metric for simplicity.}
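The following sketch illustrates one way to implement the matching-based evaluation at a single
threshold with the Hungarian algorithm from SciPy; the function names are illustrative, and a
matched pair is counted as a true positive only when its similarity exceeds the threshold $\tau$.
\begin{verbatim}
import numpy as np
from scipy.optimize import linear_sum_assignment

def evaluate_single_threshold(preds, gts, similarity, tau):
    # `preds` and `gts` are lists of lines; `similarity` is a pairwise score
    # in [0, 1] such as the EA-score; `tau` is the decision threshold.
    tp = 0
    if preds and gts:
        scores = np.array([[similarity(p, g) for g in gts] for p in preds])
        # Hungarian algorithm on negated scores = maximum-weight matching.
        rows, cols = linear_sum_assignment(-scores)
        tp = int(np.sum(scores[rows, cols] >= tau))
    fp = len(preds) - tp
    fn = len(gts) - tp
    precision = tp / (tp + fp) if preds else 0.0
    recall = tp / (tp + fn) if gts else 0.0
    f_measure = (2 * precision * recall / (precision + recall)
                 if precision + recall > 0 else 0.0)
    return precision, recall, f_measure
\end{verbatim}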
  1423. %-----------------------------------------------------------------------------------%
  1424. \subsection{\revise{Tuning the Quantization Intervals}}\label{sec:quant-intervals}
The quantization intervals $\Delta\theta$ and $\Delta r$ in~\cref{eq:quantization}
are important factors for both performance and running efficiency.
%
Larger intervals lead to fewer quantization levels, \ie smaller $\Theta$ and $R$,
and thus a faster model.
%
With smaller intervals, there are more quantization levels,
and the computational overhead is heavier.
\revise{We perform a coordinate descent on the SEL~\cite{lee2017semantic} dataset
to find intervals that are computationally efficient and functionally
effective.
%
Note that we use the EA-score as the line similarity measure due to its simplicity.
%
In the first round, we fix the angular quantization interval to $\Delta\theta=\pi/100$
and then search for $\Delta r$;
the results are shown in~\cref{fig:ablation-interval}(a).
%
According to~\cref{fig:ablation-interval}(a),
the performance first rises slowly and then drops with the decrease of $\Delta r$,
and the turning point is near $\Delta r = \sqrt{2}$.
%
In the second round, we fix $\Delta r = \sqrt{2}$ and train with different $\Delta\theta$.
%
Similar to~\cref{fig:ablation-interval}(a),
the results in~\cref{fig:ablation-interval}(b) demonstrate that
the performance first increases smoothly with the drop of $\Delta \theta$
and then quickly decreases with fluctuations.
%
Therefore, the turning point $\Delta\theta=\pi/100$ is a proper choice for angular quantization.}
In summary, we use $\Delta\theta=\pi/100$ and $\Delta r = \sqrt{2}$
for quantization, and the corresponding quantization levels are:
  1457. \begin{equation}
  1458. % \begin{split}
  1459. \Theta = 100, \ R = \sqrt{\frac{W^2+H^2}{2}},
  1460. % \end{split}
  1461. \label{eq:intervals}
  1462. \end{equation}
where $H$ and $W$ are the height and width of the feature maps to be transformed by DHT.
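For concreteness, a small illustrative helper (the function name and rounding choices are
assumptions) computes the quantization levels in~\cref{eq:intervals} from the feature-map size.
\begin{verbatim}
import math

def quantization_levels(H, W, delta_theta=math.pi / 100,
                        delta_r=math.sqrt(2)):
    # With delta_theta = pi/100 the angle axis has Theta = 100 bins; the
    # distance axis covers the diagonal of the H x W feature map, giving
    # R = sqrt((W^2 + H^2) / 2) bins when delta_r = sqrt(2).
    theta_levels = int(round(math.pi / delta_theta))
    r_levels = int(math.ceil(math.hypot(H, W) / delta_r))
    return theta_levels, r_levels
\end{verbatim}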
  1464. %-----------------------------------------------------------------------------------%
  1465. \subsection{Quantitative Comparisons} \label{sec:quantitative}
  1466. %-----------------------------------------------------------------------------------%
  1467. We compare our proposed method with the SLNet~\cite{lee2017semantic}
  1468. and the classical Hough line detection~\cite{duda1971use} with HED~\cite{xie2015holistically}
  1469. as the edge detector.
  1470. %
  1471. Note that we train the HED edge detector on the SEL~\cite{lee2017semantic} training set
  1472. using the line annotations as edge ground-truth.
  1473. \subsubsection{Results on SEL dataset}
  1474. \cref{tab:quantitative} summarizes the results on the SEL dataset~\cite{lee2017semantic}.
  1475. %
With either VGG16 or ResNet50 as the backbone, our proposed method consistently outperforms
SLNet and HED+HT by a considerable margin.
  1478. %
  1479. In addition to~\cref{tab:quantitative}, we plot the F-measure \emph{v.s.} threshold and
  1480. the precision~\emph{v.s.} recall curves.
  1481. %
  1482. \cref{fig:curves} reveals that our method achieves higher F-measure than others
  1483. under a wide range of thresholds.
  1484. % \CheckRmv{
  1485. % \begin{table*}[!htb]
  1486. % \renewcommand{\arraystretch}{1.3}
  1487. % \renewcommand\tabcolsep{8.0pt}
  1488. % \centering
  1489. % \caption{
  1490. % Quantitative comparisons
  1491. % on the SEL~\cite{lee2017semantic} and NKL dataset.
  1492. % %
  1493. % On SEL~\cite{lee2017semantic} dataset, our method \revise{(without ER)} significantly outperforms other competitors in terms of
  1494. % average F-measure.
  1495. % `CD', `EMD' and `EA' are different evaluation metrics described in \cref{sec:metric}.
  1496. % }\vspace{-6pt}
  1497. % % \resizebox{0.97\textwidth}{!}{
  1498. % \begin{tabular}{l|c|ccc|ccc|ccc}
  1499. % \toprule
  1500. % \multirow{2}{*}{Dataset} & \multirow{2}{*}{Method} & \multicolumn{3}{c|}{Avg. Precision} & \multicolumn{3}{c|}{Avg. Recall} & \multicolumn{3}{c}{Avg. F-measure} \\
  1501. % & & CD & EMD & EA & CD & EMD & EA & CD & EMD & EA \\
  1502. % \hline
  1503. % \multirow{6}{*}{SEL~\cite{lee2017semantic}} &
  1504. % HED~\cite{xie2015holistically} + HT~\cite{duda1971use} & 0.491 & 0.461 & 0.356 & 0.578 & 0.543 & 0.420 & 0.531 & 0.498 & 0.385 \\
  1505. % & SLNet-iter1~\cite{lee2017semantic} & 0.740 & 0.723 & 0.654 & \textbf{0.905} & \textbf{0.888} & \textbf{0.803} & 0.812 & 0.797 & 0.721 \\
  1506. % % & SLNet-iter3~\cite{lee2017semantic} & & & 0.731 & & & 0.755 & & & 0.743 & 1.92\\
  1507. % & SLNet-iter5~\cite{lee2017semantic} & 0.826 & 0.810 & 0.735 & 0.841 & 0.824 & 0.747 & 0.834 & 0.817 & 0.741 \\
  1508. % & SLNet-iter10~\cite{lee2017semantic} & 0.858 & 0.840 & 0.762 & 0.821 & 0.804 & 0.729 & 0.839 & 0.822 & 0.745 \\
  1509. % % \cline{2-12}
  1510. % & Ours (VGG16) & 0.841 & 0.830& 0.756 & 0.835& 0.824 & 0.774 & 0.838 & 0.827 & 0.765\\
  1511. % % \cline{2-12}
  1512. % % Ours(FPN-ResNet50) & 0.883 & 0.835 & 0.858 \\
  1513. % & Ours (ResNet50) & \textbf{0.886} & \textbf{0.878} & \textbf{0.819} & 0.815 & 0.807 & 0.755 & \textbf{0.849} & \textbf{0.841} & \textbf{0.786} \\
  1514. % \hline
  1515. % \multirow{3}{*}{NKL} &
  1516. % HED~\cite{xie2015holistically} + HT~\cite{duda1971use} & 0.301 & - & 0.213 & 0.878 & - & 0.622 & 0.448 & - & 0.318 \\
  1517. % & Ours (VGG16) & 0.750 & 0.726 & 0.659 & 0.864 & 0.837 & 0.759 & 0.803 & 0.778 & 0.706 \\
  1518. % & Ours (ResNet50) & 0.766 & 0.743 & 0.679 & 0.864 & 0.839 & 0.766 & 0.812 & 0.789 & 0.719 \\
  1519. % \bottomrule
  1520. % %-----------------------------------------------%
  1521. % \end{tabular}
  1522. % \label{tab:quantitative}
  1523. % \end{table*}
  1524. % }
  1525. \CheckRmv{
  1526. \begin{table*}[!htb]
  1527. \renewcommand{\arraystretch}{1.3}
  1528. \renewcommand\tabcolsep{8.0pt}
  1529. \centering
  1530. % \color{blue}
  1531. \caption{
  1532. Quantitative comparisons
  1533. on the SEL~\cite{lee2017semantic} and NKL dataset.
  1534. %
On the SEL~\cite{lee2017semantic} dataset, our method (without ER) significantly outperforms the other competitors in terms of
average F-measure.
  1537. `CD,' `EMD,' and `EA' are different evaluation metrics described in \cref{sec:metric}.
  1538. }\vspace{-6pt}
  1539. % \resizebox{0.97\textwidth}{!}{
  1540. \begin{tabular}{l|c|ccc|ccc|ccc}
  1541. \toprule
  1542. \multirow{2}{*}{Dataset} & \multirow{2}{*}{Method} & \multicolumn{3}{c|}{CD} & \multicolumn{3}{c|}{EMD} & \multicolumn{3}{c}{EA} \\
  1543. & & Avg. P & Avg. R & Avg. F & Avg. P & Avg. R & Avg. F & Avg. P & Avg. R & Avg. F \\
  1544. \hline
  1545. \multirow{6}{*}{SEL~\cite{lee2017semantic}} &
  1546. HED~\cite{xie2015holistically} + HT~\cite{duda1971use} & 0.491 & 0.578 & 0.531 & 0.461 & 0.543 & 0.498 & 0.356 & 0.420 & 0.385 \\
  1547. & SLNet-iter1~\cite{lee2017semantic} & 0.740 & \textbf{0.905} & 0.812 & 0.723 & \textbf{0.888} & 0.797 & 0.654 & \textbf{0.803} & 0.721 \\
  1548. % & SLNet-iter3~\cite{lee2017semantic} & & & 0.731 & & & 0.755 & & & 0.743 & 1.92\\
  1549. & SLNet-iter5~\cite{lee2017semantic} & 0.826 & 0.841 & 0.834 & 0.810 & 0.824 & 0.817 & 0.735 & 0.747 & 0.741 \\
  1550. & SLNet-iter10~\cite{lee2017semantic} & 0.858 & 0.821 & 0.839 & 0.840 & 0.804 & 0.822 &0.762 & 0.729 & 0.745 \\
  1551. % \cline{2-12}
  1552. & Ours (VGG16) & 0.841 & 0.835 & 0.838 & 0.830 & 0.824 & 0.827 & 0.756 & 0.774 & 0.765\\
  1553. % \cline{2-12}
  1554. % Ours(FPN-ResNet50) & 0.883 & 0.835 & 0.858 \\
  1555. & Ours (ResNet50) & \textbf{0.886} & 0.815 & \textbf{0.849} & \textbf{0.878} & 0.807 & \textbf{0.841} & \textbf{0.819} & 0.755 & \textbf{0.786} \\
  1556. \hline
  1557. \multirow{3}{*}{NKL} &
  1558. HED~\cite{xie2015holistically} + HT~\cite{duda1971use} & 0.301 & 0.878 & 0.448 & - & - & - & 0.213 & 0.622 & 0.318 \\
  1559. & Ours (VGG16) & 0.750 & 0.864 & 0.803 & 0.726 & 0.837 & 0.778 & 0.659 & 0.759 & 0.706 \\
  1560. & Ours (ResNet50) & 0.766 & 0.864 & 0.812 & 0.743 & 0.839 & 0.789 & 0.679 & 0.766 & 0.719 \\
  1561. \bottomrule
  1562. %-----------------------------------------------%
  1563. \end{tabular}
  1564. \label{tab:quantitative}
  1565. \end{table*}
  1566. }
  1567. \CheckRmv{
  1568. \begin{figure*}[!htb]
  1569. \begin{center}
  1570. \begin{overpic}[width=0.9\linewidth]{figures/fpr-curve-pami.pdf}
  1571. \end{overpic}
  1572. \end{center}\vspace{-10pt}
  1573. \caption{
  1574. Left: F-measure under various thresholds.
  1575. Right: The precision-recall curve.
  1576. %
Our method outperforms SLNet~\cite{lee2017semantic} and the classical
Hough transform~\cite{duda1971use} by a
considerable margin.
  1580. %
  1581. Moreover, even with 10 rounds of location refinement,
  1582. SLNet still presents inferior performance.
  1583. }\vspace{20pt}
  1584. \label{fig:curves}
  1585. \end{figure*}
  1586. }
  1587. \CheckRmv{
  1588. \begin{figure*}[htb!]
  1589. \begin{center}
  1590. \begin{overpic}[width=1.0 \linewidth]{figures/visual.jpg}
  1591. \put(0.5, 5){GT}
  1592. \put(0, 19){Ours}
  1593. \put(-1, 34.5){SLNet}
  1594. \put(-1.3, 32){(iter10)}
  1595. % \put(0, 29){\cite{lee2017semantic}}
  1596. \put(-1, 47){SLNet}
  1597. \put(-0.5, 44.5){(iter1)}
  1598. % \put(0, 41.5){\cite{lee2017semantic}}
  1599. % \put(-1, 59){SLNet}
  1600. % \put(-0.5, 56.5){\scriptsize (iter1)}
  1601. % \put(0, 41.5){\cite{lee2017semantic}}
  1602. \put(-1, 59){HED}
  1603. % \put(0, 58.5){\cite{xie2015holistically}}
  1604. \put(-1, 56.5){+ HT}
  1605. % \put(0.5, 53){\cite{duda1971use}}
  1606. \end{overpic}
  1607. \end{center}
  1608. \vspace{-12pt}
  1609. \caption{
  1610. Example detection results of different methods on the SEL dataset.
  1611. %
  1612. Compared to SLNet~\cite{lee2017semantic}
  1613. and classical Hough transform~\cite{duda1971use},
  1614. our results are more consistent with the ground-truth.
  1615. }
  1616. \label{fig:detections-SEL}
  1617. \end{figure*}
  1618. }
  1619. \subsubsection{Results on the \revise{NKL} dataset}
We report the performance of our method on the newly constructed \revise{NKL} dataset.
  1621. %
  1622. Since SLNet~\cite{lee2017semantic} did not release the training code,
  1623. we only compare our method with HED+HT.
  1624. %
\rerevise{As shown in~\cref{tab:quantitative}, our proposed method outperforms
the baseline method (HED edge detector + Hough transform) by a clear margin.}
  1627. % \CheckRmv{
  1628. % \begin{table}[!htb]
  1629. % \renewcommand{\arraystretch}{1.3}
  1630. % \renewcommand\tabcolsep{5.0pt}
  1631. % \centering
  1632. % \caption{
  1633. % Quantitative results on the \revise{NKL dataset (without ER)} .
  1634. % }\vspace{-6pt}
  1635. % % \resizebox{0.97\textwidth}{!}{
  1636. % \begin{tabular}{l|c|c|c}
  1637. % \toprule
  1638. % Method & \makecell{Avg. \\ Precision} & \makecell{Avg. \\ Recall} & \makecell{Avg. \\ F-measure}\\
  1639. % \hline
  1640. % HED~\cite{xie2015holistically} + HT~\cite{duda1971use} & \revise{0.213} & \revise{0.622} & \revise{0.318} \\
  1641. % Ours (VGG16) & \revise{0.659} & \revise{0.759} & \revise{0.706}\\
  1642. % Ours (ResNet50) & \revise{0.679} & \revise{0.766} & \revise{0.719}\\
  1643. % \bottomrule
  1644. % %-----------------------------------------------%
  1645. % \end{tabular}
  1646. % \label{tab:quantitative-nkl}
  1647. % \end{table}
  1648. % }
  1649. \CheckRmv{
  1650. \begin{figure*}[tb!]
  1651. \begin{center}
  1652. \begin{overpic}[width=1.0 \linewidth]{figures/sl6500_results.pdf}
  1653. \end{overpic}
  1654. \end{center}
  1655. \vspace{-12pt}
  1656. \caption{
  1657. Detection results of our method on the \revise{NKL} dataset.
  1658. %
Our method produces results that are visually consistent with human perception.
  1660. }
  1661. \label{fig:detections-NKL}
  1662. \end{figure*}
  1663. }
  1664. %-----------------------------------------------------------------------------------%
  1665. \subsubsection{Runtime efficiency.}
In this section, we benchmark the runtime of different methods, including SLNet~\cite{lee2017semantic}
with various iteration steps, the classical Hough transform, and our proposed method.
Both SLNet~\cite{lee2017semantic} and HT require the HED~\cite{xie2015holistically}
edge detector as a preprocessing step.
%
The non-maximal suppression (NMS) in SLNet requires edge maps as guidance,
%
and the classical Hough transform takes an edge map as input.
%
Moreover, SLNet uses a refining network to enhance the results iteratively,
so its inference speed depends on the number of iteration steps.
%
In contrast, our method produces results with a single forward pass,
and the NMS is as simple as computing the centroid of each connected area
in the parametric space.
Results in~\cref{tab:speed} illustrate that our method is significantly faster than
all other competitors.
  1683. %
  1684. Even with only 1 iteration step, SLNet is still slower than our method.
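An illustrative sketch of the simple NMS described above is given below: the predicted
parametric-space map is binarized and the centroid of each connected component is kept as one
detected $(\theta, r)$ line; the threshold value and the function name are assumptions.
\begin{verbatim}
import numpy as np
from scipy import ndimage

def parametric_nms(vote_map, threshold=0.01):
    # Binarize the predicted Hough-space map and keep one (theta_idx, r_idx)
    # centroid per connected component as a detected line.
    binary = vote_map > threshold
    labels, num = ndimage.label(binary)
    if num == 0:
        return []
    centroids = ndimage.center_of_mass(vote_map, labels, range(1, num + 1))
    return [(float(t), float(r)) for t, r in centroids]
\end{verbatim}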
  1685. \CheckRmv{
  1686. \begin{table}[!htb]
  1687. \renewcommand{\arraystretch}{1.4}
  1688. \renewcommand\tabcolsep{4.4pt}
  1689. \centering
  1690. \caption{
  1691. Quantitative speed comparisons.
  1692. %
Our method \revise{(without ER)} is much faster than the other two competitors in the network forward pass.
%
Furthermore, our method does not require any extra processing, \eg edge detection.
  1696. %
  1697. As a result, our method can run at 49 FPS, which is remarkably
  1698. higher than the other two methods.
  1699. }\vspace{-6pt}
  1700. \label{tab:speed}
  1701. \begin{tabular}{l|c|c|c|c}
  1702. \toprule
  1703. Method & Network forward & NMS & Edge & Total \\
  1704. \hline
  1705. SLNet-iter1~\cite{lee2017semantic} & 0.354\ s & 0.079\ s & 0.014\ s & 0.447\ s \\
  1706. SLNet-iter3~\cite{lee2017semantic} & 0.437\ s & 0.071\ s & 0.014\ s & 0.522\ s \\
  1707. SLNet-iter10~\cite{lee2017semantic} & 0.827\ s & 0.068\ s & 0.014\ s & 0.909\ s \\
  1708. HED~\cite{xie2015holistically} + HT~\cite{duda1971use} & 0.014\ s & 0.117\ s & 0.024\ s & 0.155\ s \\
  1709. \hline
  1710. Ours (VGG16) & 0.03\ s & 0.003\ s & 0 & 0.033\ s \\
  1711. \hline
  1712. Ours (ResNet50) & 0.017\ s & 0.003\ s & 0 & \textbf{0.020\ s} \\
  1713. \bottomrule
  1714. %-----------------------------------------------%
  1715. \end{tabular}
  1716. \end{table}
  1717. }
  1718. %-----------------------------------------------------------------------------------%
  1719. \subsection{Qualitative Comparisons } \label{sec:visual}
Here we give several example results of our proposed method along with those of SLNet
and HED+HT.
%
As shown in~\cref{fig:detections-SEL}, compared with other methods, our results are more consistent with
the ground-truth as well as with human perception.
  1725. %
  1726. In addition to the results in~\cref{fig:detections-SEL}, we provide all the detection results of our method
  1727. and SLNet \revise{in the supplementary materials}.
  1728. %-----------------------------------------------------------------------------------%
  1729. \subsection{Ablation Study} \label{sec:ablation}
  1730. In this section, we ablate each of the components in our method.
  1731. \subsubsection{Components in DHT} \label{sec:ablation-dht}
  1732. We first ablate components of ``deep Hough transform''.
  1733. %
  1734. Specifically, they are:
  1735. (a) the Deep Hough transform (DHT) module detailed in~\cref{sec:dht-dht};
  1736. (b) the multi-scale (MS) DHT architecture described in~\cref{sec:ms-dht-fpn};
  1737. (c) the context-aware (CTX) line detector proposed in~\cref{sec:ctx-line-detector}.
  1738. %
  1739. Experimental results are shown in~\cref{tab:ablation}.
We first construct a baseline model with a plain ResNet50 and the DHT module.
  1741. %
  1742. % Note that the baseline model achieves 0.845 average F-measure, which
  1743. % has already surpassed the SLNet competitor.
  1744. %
Then we verify the effectiveness of the multi-scale (MS) strategy and the context-aware
line detector (CTX) individually:
%
we separately append MS and CTX to the baseline model and evaluate the resulting performance.
  1749. %
  1750. Results in~\cref{tab:ablation} indicate that both MS and CTX can improve
  1751. the performance of the baseline model.
Finally, we combine all the components to form our full method,
which achieves the best performance among all combinations.
%
The experimental results in this section clearly demonstrate that each component of
our proposed method contributes to its success.
  1757. \CheckRmv{
  1758. \begin{table}[!htb]
  1759. \renewcommand{\arraystretch}{1.3}
  1760. \renewcommand\tabcolsep{1.0pt}
  1761. \newcolumntype{C}{>{\centering\arraybackslash}p{0.11\textwidth}}
  1762. \centering
  1763. \caption{
  1764. Ablation study for each component. MS indicates DHTs with multi-scale features as described in~\cref{sec:ms-dht-fpn},
  1765. and CTX means context-aware aggregation as described in~\cref{sec:ctx-line-detector}.
  1766. }\vspace{-6pt}
  1767. % \resizebox{0.8\textwidth}{!}{
  1768. \begin{tabular}{C|C|C|C}
  1769. \toprule
  1770. DHT & MS & CTX & F-measure \\
  1771. % HED+HT & DHT+RHT & MS & CTX & mean \emph{F-measure} \\
  1772. \hline
  1773. % \checkmark & & & 0.846 \\
  1774. % \checkmark & & & & 0.829 \\
  1775. \checkmark & & & 0.664 \\
  1776. \checkmark & \checkmark & & 0.758 \\
  1777. \checkmark & & \checkmark & 0.771 \\
  1778. % \checkmark & \checkmark & & 0.852 \\
  1779. \checkmark & \checkmark & \checkmark & 0.786 \\
  1780. \bottomrule
  1781. %-----------------------------------------------%
  1782. \end{tabular}
  1783. \label{tab:ablation}
  1784. \end{table}
  1785. }
  1786. \subsubsection{Edge-guided Refinement} \label{sec:ablation-refinement}
  1787. Here we ablate the ``Edge-guided Refinement'' module \revise{(abbreviated as ER)}.
  1788. %
  1789. First, we test the performance of DHT+ER using different $\delta_r$.
  1790. %
The $\delta_r$ parameter controls the size of the search space
in ER ($\mathcal{L}$ in~\cref{eq:refine-search}).
  1793. %
  1794. This experiment is conducted on the SEL dataset using the ResNet50 backbone.
  1795. \CheckRmv{
  1796. \begin{table}[!htb]
  1797. \renewcommand{\arraystretch}{1.3}
  1798. \newcolumntype{C}{>{\centering\arraybackslash}p{0.08\textwidth}}
  1799. \centering
  1800. \caption{
Performance of DHT+ER with different $\delta_r$.
%
Models are trained/tested on the SEL dataset using the ResNet50 backbone.
%
$\delta_r=0$ represents the vanilla DHT method without ER.
  1806. }\vspace{-6pt}
  1807. \newcommand{\CC}{\cellcolor{gray!20}}
  1808. \begin{tabular}{C|C|C|C}
  1809. \toprule
  1810. $\delta_r$ & Precision & Recall & F-measure\\
  1811. \hline
  1812. \CC 0 & \CC 0.8190 & \CC 0.7530 & \CC 0.7861 \\
  1813. 1 & 0.8199 & 0.7561 & 0.7866 \\
  1814. 3 & 0.8208 & 0.7569 & 0.7874 \\
  1815. 5 & 0.8214 & 0.7574 & 0.7880 \\
  1816. 7 & 0.8213 & 0.7573 & 0.7878 \\
  1817. 9 & 0.8212 & 0.7571 & 0.7877 \\
  1818. \bottomrule
  1819. %-----------------------------------------------%
  1820. \end{tabular}
  1821. \label{tab:ablation-refinement-1}
  1822. \end{table}
  1823. }
Results in~\cref{tab:ablation-refinement-1} show that the performance first increases and
then saturates as $\delta_r$ grows.
%
Since the peak performance occurs at $\delta_r = 5$,
%
we set $\delta_r=5$ in the following experiments.
  1830. %
  1831. After setting $\delta_r$ to 5, we compare the performance of our method with and without
  1832. ER, using different backbones \revise{and} datasets.
  1833. \CheckRmv{
  1834. \begin{table}[!htb]
  1835. \renewcommand{\arraystretch}{1.3}
  1836. \centering
  1837. \caption{
  1838. Performance with and without ER ($\delta_r=5$) using different backbones \revise{and} datasets.
  1839. }\vspace{-6pt}
  1840. \newcommand{\CC}{\cellcolor{gray!20}}
  1841. \begin{tabular}{l|c|c|c|c|c|c}
  1842. \toprule
  1843. Dataset & Arch & Edge & P & R & F & F@0.95\\
  1844. \hline
  1845. \multirow{4}{*}{SEL~\cite{lee2017semantic}} & VGG16 & & 0.756 & 0.774 & 0.765 & 0.380\\
  1846. & \CC VGG16 & \CC \checkmark & \CC 0.758 & \CC 0.777 & \CC 0.770 & \CC 0.439 \\
  1847. & Resnet50 & & 0.819 & 0.753 & 0.786 & 0.420\\
  1848. & \CC Resnet50 & \CC \checkmark & \CC 0.821 & \CC 0.757 & \CC 0.788 & \CC 0.461\\
  1849. \hline
  1850. \multirow{4}{*}{\revise{NKL}} & VGG16 & & \revise{0.659} & \revise{0.759} & \revise{0.706} & \revise{0.434}\\
  1851. & \CC VGG16 & \CC\checkmark & \CC \revise{0.664} & \CC \revise{0.765} & \CC \revise{0.711} & \CC \revise{0.472}\\
  1852. & Resnet50 & & \revise{0.679} & \revise{0.766} & \revise{0.719} & \revise{0.459}\\
  1853. & \CC Resnet50 & \CC \checkmark & \CC \revise{0.684} & \CC \revise{0.771} & \CC \revise{0.725} & \CC \revise{0.486}\\
  1854. \bottomrule
  1855. %-----------------------------------------------%
  1856. \end{tabular}
  1857. \label{tab:ablation-refinement-2}
  1858. \end{table}
  1859. }
Results in~\cref{tab:ablation-refinement-2} clearly demonstrate that
edge-guided refinement can effectively improve the detection results regardless
of the backbone architecture and dataset.
  1863. %-----------------------------------------------------------------------------------%
  1864. \section{Conclusions}\label{sec:conclusion}
  1865. %-----------------------------------------------------------------------------------%
  1866. In this paper, we proposed a simple yet effective method for semantic line detection in
  1867. natural scenes.
  1868. %
By incorporating the strong learning ability of CNNs into the classical Hough transform,
our method is able to capture complex textures and rich contextual semantics of lines.
  1871. %
  1872. To better assess the similarity between a pair of lines,
  1873. we designed a new evaluation metric considering both Euclidean distance and angular distance
  1874. between lines.
  1875. %
Besides, a new dataset for semantic line detection
was constructed to bridge the gap between the scale of existing datasets
and the complexity of modern CNN models.
  1879. %
Both quantitative and qualitative results revealed that
our method significantly outperforms prior art in terms of both detection quality and speed.
  1882. \section*{Acknowledgment}
This research was supported by the Major Project for New Generation of AI
under Grant No. 2018AAA0100400, NSFC (61922046, 61620106008, 62002176),
the S\&T innovation project from the Chinese Ministry of Education,
and the Tianjin Natural Science Foundation (17JCJQJC43700).
  1887. \bibliographystyle{IEEEtran}
  1888. \bibliography{line}
  1889. \ifCLASSOPTIONcaptionsoff
  1890. \newpage
  1891. \fi
  1892. \ArxivRmv{
  1893. \newcommand{\AddPhoto}[1]{\includegraphics%
  1894. [width=1in,height=1.25in,clip,keepaspectratio]{figures/photos/#1}}
  1895. \begin{IEEEbiography}[\AddPhoto{kai}]{Kai Zhao}
  1896. received his B.S. and M.S. from Shanghai University.
  1897. He is currently a Ph.D. Candidate with the
  1898. College of Computer Science, Nankai University, under the supervision of
  1899. Prof. Ming-Ming Cheng.
  1900. His research interests include statistical learning and computer vision.
  1901. \end{IEEEbiography}
  1902. \vspace{-.4in}
  1903. \begin{IEEEbiography}[\AddPhoto{hanqi}]{Qi Han}
  1904. is a master student from the College of Computer Science, Nankai University,
  1905. under the supervision of Prof. Ming-Ming Cheng.
He received his bachelor's degree from Xidian University in 2019.
  1907. His research interests include deep learning and computer vision.
  1908. \end{IEEEbiography}
  1909. \vspace{-.4in}
  1910. \begin{IEEEbiography}[\AddPhoto{zhangchbin}]{Chang-Bin Zhang}
  1911. is a master student from the College of Computer Science at Nankai University,
  1912. under the supervision of Prof. Ming-Ming Cheng.
Before that, he received his bachelor's degree from China University of Mining and Technology in 2019.
  1914. His research interests include deep learning and computer vision.
  1915. \end{IEEEbiography}
  1916. \vspace{-.4in}
  1917. \begin{IEEEbiography}[\AddPhoto{xujun}]{Jun Xu}
received his B.Sc. and M.Sc. degrees from the School of Mathematical
Sciences, Nankai University, in 2011 and 2014, respectively, and his Ph.D. degree from the Department of Computing, The Hong Kong Polytechnic University, in 2018. He is a Lecturer with the School of Statistics and Data Science, Nankai University.
  1920. His homepage is \url{https://csjunxu.github.io/}.
  1921. \end{IEEEbiography}
  1922. \vspace{-.4in}
  1923. \begin{IEEEbiography}[\AddPhoto{cmm}]{Ming-Ming Cheng}
received his PhD degree
from Tsinghua University in 2012. He then spent
two years as a research fellow with Prof. Philip Torr at
Oxford. He is now a professor at Nankai University, leading the Media Computing Lab. His
research interests include computer graphics,
computer vision, and image processing. He has received research awards including the ACM China
Rising Star Award, the IBM Global SUR Award,
and the CCF-Intel Young Faculty Researcher Program,
among others.
  1933. \end{IEEEbiography}
  1934. }
  1935. \end{document}