@article{Yang2024thSORT,
  author  = {Yang, Mouzhi and Zhang, Peng and Fang, Jianbin and Liu, Weifeng and Huang, Chun},
  title   = {thSORT: An Efficient Parallel Sorting Algorithm on Multi-core DSPs},
  journal = {CCF Transactions on High Performance Computing},
  year    = {2024},
  doi     = {10.1007/s42514-023-00175-7},
}

@inproceedings{li2023haspmv,
  author       = {Li, Wenxuan and Cheng, Helin and Lu, Zhengyang and Lu, Yuechen and Liu, Weifeng},
  title        = {HASpMV: Heterogeneity-Aware Sparse Matrix-Vector Multiplication on Modern Asymmetric Multicore Processors},
  booktitle    = {2023 IEEE International Conference on Cluster Computing (CLUSTER)},
  pages        = {1--12},
  year         = {2023},
  organization = {IEEE},
}

@inproceedings{liao2023exploiting,
  author       = {Liao, Jianjin and Li, Mingzhen and Yang, Hailong and Sun, Qingxiao and Sun, Biao and Hao, Jiwei and Feng, Tianyu and Yu, Fengwei and Chen, Shengdong and Tao, Ye and Zhang, Zicheng and Luan, Zhongzhi and Qian, Depei},
  title        = {Exploiting Input Tensor Dynamics in Activation Checkpointing for Efficient Training on GPU},
  booktitle    = {2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  pages        = {1--15},
  year         = {2023},
  organization = {IEEE},
}

% NOTE(review): volume/number/pages below look like placeholders for a 2023 article -- confirm against the DOI.
@article{sun2023Adaptive,
  author  = {Sun, Qingxiao and Liu, Yi and Yang, Hailong and Jiang, Zhonghui and Luan, Zhongzhi and Qian, Depei},
  title   = {Adaptive Auto-tuning Framework for Global Exploration of Stencil Optimization on GPUs},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  year    = {2023},
  volume  = {33},
  number  = {1},
  pages   = {1--15},
  doi     = {10.1109/TPDS.2023.3325630},
}


@inproceedings{Lu2023DASP,
  author       = {Lu, Yuechen and Liu, Weifeng},
  title        = {DASP: Specific Dense Matrix Multiply-Accumulate Units Accelerated General Sparse Matrix-Vector Multiplication},
  booktitle    = {36th ACM/IEEE International Conference for High Performance Computing, Networking, Storage, and Analysis (SC)},
  year         = {2023},
  organization = {ACM/IEEE},
}

@inproceedings{Fu2023PanguLU,
  author       = {Fu, Xu and Zhang, Bingbin and Wang, Tengcheng and Li, Wenhao and Lu, Yuechen and Yi, Enxin and Zhao, Jianqi and Geng, Xiaohan and Li, Fangying and Zhang, Jingwen and Jin, Zhou and Liu, Weifeng},
  title        = {PanguLU: A Scalable Regular Two-Dimensional Block-Cyclic Sparse Direct Solver on Distributed Heterogeneous Systems},
  booktitle    = {36th ACM/IEEE International Conference for High Performance Computing, Networking, Storage, and Analysis (SC)},
  year         = {2023},
  organization = {ACM/IEEE},
}

@article{Lu2023TileSpTRSV,
  author  = {Lu, Zhengyang and Liu, Weifeng},
  title   = {TileSpTRSV: a tiled algorithm for parallel sparse triangular solve on GPUs},
  journal = {CCF Transactions on High Performance Computing},
  year    = {2023},
  volume  = {5},
  number  = {2},
  pages   = {129--143},
  doi     = {10.1007/s42514-023-00151-1},
}

@inproceedings{Cheng2023HASpGEMM,
  author    = {Cheng, Helin and Li, Wenxuan and Lu, Yuechen and Liu, Weifeng},
  title     = {HASpGEMM: Heterogeneity-Aware Sparse General Matrix-Matrix Multiplication on Modern Asymmetric Multicore Processors},
  booktitle = {2023 Proceedings of the 52nd International Conference on Parallel Processing (ICPP)},
  year      = {2023},
  pages     = {807--817},
}

@inproceedings{wang2023Accelerating,
  author       = {Wang, Tengcheng and Li, Wenhao and Pei, Haojie and Sun, Yuying and Jin, Zhou and Liu, Weifeng},
  title        = {Accelerating Sparse LU Factorization with Density-Aware Adaptive Matrix Multiplication for Circuit Simulation},
  booktitle    = {2023 60th ACM/IEEE Design Automation Conference (DAC)},
  year         = {2023},
  pages        = {1--6},
  organization = {ACM/IEEE},
}

@inproceedings{fan2023AmgR,
  author       = {Fan, Mingjia and Tian, Xiaotian and He, Yintao and Li, Junxian and Duan, Yiru and Hu, Xiaozhe and Wang, Ying and Jin, Zhou and Liu, Weifeng},
  title        = {AmgR: Algebraic Multigrid Accelerated on ReRAM},
  booktitle    = {2023 60th ACM/IEEE Design Automation Conference (DAC)},
  year         = {2023},
  pages        = {1--6},
  organization = {ACM/IEEE},
}

@inproceedings{mi2023balancing,
  author       = {Mi, Hongli and Yu, Xiangrui and Yu, Xiaosong and Wu, Shuangyuan and Liu, Weifeng},
  title        = {Balancing Computation and Communication in Distributed Sparse Matrix-Vector Multiplication},
  booktitle    = {2023 IEEE/ACM 23rd International Symposium on Cluster, Cloud and Internet Computing (CCGrid)},
  year         = {2023},
  pages        = {535--544},
  organization = {IEEE},
}

@inproceedings{zha2023Deeplearning,
  author       = {Zha, Xiaru and Pei, Haojie and Niu, Dan and Wu, Xiao and Jin, Zhou},
  title        = {Deep Learning Enhanced Time-Step Control in Pseudo Transient Analysis for Efficient Nonlinear DC Simulation},
  booktitle    = {2023 International Symposium of Electronics Design Automation (ISEDA)},
  year         = {2023},
  pages        = {23--28},
  organization = {IEEE},
}

@article{yu2018improving,
  author    = {Yu, Chao and Bai, Yuebin and Sun, Qingxiao and Yang, Hailong},
  title     = {Improving thread-level parallelism in GPUs through expanding register file to scratchpad memory},
  journal   = {ACM Transactions on Architecture and Code Optimization (TACO)},
  year      = {2018},
  volume    = {15},
  number    = {4},
  pages     = {1--24},
  publisher = {ACM New York, NY, USA},
}

@inproceedings{dun2020accelerating,
  author       = {Dun, Ming and Li, Yunchun and You, Xin and Sun, Qingxiao and Luan, Zerong and Yang, Hailong},
  title        = {Accelerating De Novo Assembler WTDBG2 on Commodity Servers},
  booktitle    = {International Conference on Algorithms and Architectures for Parallel Processing},
  year         = {2020},
  pages        = {232--246},
  organization = {Springer},
}

@article{dun2021towards,
  author    = {Dun, Ming and Li, Yunchun and Sun, Qingxiao and Yang, Hailong and Li, Wei and Luan, Zhongzhi and Gan, Lin and Yang, Guangwen and Qian, Depei},
  title     = {Towards efficient canonical polyadic decomposition on sunway many-core processor},
  journal   = {Information Sciences},
  year      = {2021},
  volume    = {549},
  pages     = {221--248},
  publisher = {Elsevier},
}

@article{li2020deep,
  author    = {Li, Mingzhen and Liu, Yi and Liu, Xiaoyan and Sun, Qingxiao and You, Xin and Yang, Hailong and Luan, Zhongzhi and Gan, Lin and Yang, Guangwen and Qian, Depei},
  title     = {The deep learning compiler: A comprehensive survey},
  journal   = {IEEE Transactions on Parallel and Distributed Systems},
  year      = {2020},
  volume    = {32},
  number    = {3},
  pages     = {708--727},
  publisher = {IEEE},
}

@article{xiao2021highly,
  author    = {Xiao, Zhiyong and Liu, Xu and Xu, Jingheng and Sun, Qingxiao and Gan, Lin},
  title     = {Highly scalable parallel genetic algorithm on sunway many-core processors},
  journal   = {Future Generation Computer Systems},
  year      = {2021},
  volume    = {114},
  pages     = {679--691},
  publisher = {Elsevier},
}

@inproceedings{li2021automatic,
  author    = {Li, Mingzhen and Liu, Yi and Yang, Hailong and Hu, Yongmin and Sun, Qingxiao and Chen, Bangduo and You, Xin and Liu, Xiaoyan and Luan, Zhongzhi and Qian, Depei},
  title     = {Automatic code generation and optimization of large-scale stencil computation on many-core processors},
  booktitle = {Proceedings of the 50th International Conference on Parallel Processing},
  year      = {2021},
  pages     = {1--12},
}

@inproceedings{hao2022towards,
  author       = {Hao, Jiwei and Yang, Hailong and Sun, Qingxiao and Zhang, Huaitao and Luan, Zhongzhi and Qian, Depei},
  title        = {Towards Optimized Streaming Tensor Completion on multiple GPUs},
  booktitle    = {2022 IEEE 24th Int Conf on High Performance Computing \& Communications; 8th Int Conf on Data Science \& Systems; 20th Int Conf on Smart City; 8th Int Conf on Dependability in Sensor, Cloud \& Big Data Systems \& Application (HPCC/DSS/SmartCity/DependSys)},
  year         = {2022},
  pages        = {1123--1128},
  organization = {IEEE},
}

@inproceedings{dun2021optimized,
  author    = {Dun, Ming and Li, Yunchun and Yang, Hailong and Sun, Qingxiao and Luan, Zhongzhi and Qian, Depei},
  title     = {An optimized tensor completion library for multiple GPUs},
  booktitle = {Proceedings of the ACM International Conference on Supercomputing},
  year      = {2021},
  pages     = {417--430},
}

@inproceedings{sun2021cstuner,
  author       = {Sun, Qingxiao and Liu, Yi and Yang, Hailong and Jiang, Zhonghui and Liu, Xiaoyan and Dun, Ming and Luan, Zhongzhi and Qian, Depei},
  title        = {cstuner: Scalable auto-tuning framework for complex stencil computation on gpus},
  booktitle    = {2021 IEEE International Conference on Cluster Computing (CLUSTER)},
  year         = {2021},
  pages        = {192--203},
  organization = {IEEE},
}

@inproceedings{sun2022cognn,
  author       = {Sun, Qingxiao and Liu, Yi and Yang, Hailong and Zhang, Ruizhe and Dun, Ming and Li, Mingzhen and Liu, Xiaoyan and Xiao, Wencong and Li, Yong and Luan, Zhongzhi and others},
  title        = {CoGNN: efficient scheduling for concurrent GNN training on GPUs},
  booktitle    = {SC22: International Conference for High Performance Computing, Networking, Storage and Analysis},
  year         = {2022},
  pages        = {1--15},
  organization = {IEEE},
}

@inproceedings{sun2022stencilmart,
  author       = {Sun, Qingxiao and Liu, Yi and Yang, Hailong and Jiang, Zhonghui and Luan, Zhongzhi and Qian, Depei},
  title        = {Stencilmart: Predicting optimization selection for stencil computations across gpus},
  booktitle    = {2022 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  year         = {2022},
  pages        = {875--885},
  organization = {IEEE},
}

@article{sun2022qos,
  author    = {Sun, Qingxiao and Liu, Yi and Yang, Hailong and Li, Mingzhen and Luan, Zhongzhi and Qian, Depei},
  title     = {QoS-aware dynamic resource allocation with improved utilization and energy efficiency on GPU},
  journal   = {Parallel Computing},
  volume    = {113},
  pages     = {102958},
  year      = {2022},
  publisher = {Elsevier},
}

@inproceedings{sun2020sptfs,
  author       = {Sun, Qingxiao and Liu, Yi and Dun, Ming and Yang, Hailong and Luan, Zhongzhi and Gan, Lin and Yang, Guangwen and Qian, Depei},
  title        = {Sptfs: Sparse tensor format selection for mttkrp via deep learning},
  booktitle    = {SC20: International Conference for High Performance Computing, Networking, Storage and Analysis},
  year         = {2020},
  pages        = {1--14},
  organization = {IEEE},
}

@article{sun2021input,
  author    = {Sun, Qingxiao and Liu, Yi and Yang, Hailong and Dun, Ming and Luan, Zhongzhi and Gan, Lin and Yang, Guangwen and Qian, Depei},
  title     = {Input-aware sparse tensor storage format selection for optimizing mttkrp},
  journal   = {IEEE Transactions on Computers},
  volume    = {71},
  number    = {8},
  pages     = {1968--1981},
  year      = {2021},
  publisher = {IEEE},
}

@inproceedings{sun2019smqos,
  author       = {Sun, Qingxiao and Liu, Yi and Yang, Hailong and Luan, Zhongzhi and Qian, Depei},
  title        = {Smqos: Improving utilization and energy efficiency with qos awareness on gpus},
  booktitle    = {2019 IEEE International Conference on Cluster Computing (CLUSTER)},
  year         = {2019},
  pages        = {1--5},
  organization = {IEEE},
}

@inproceedings{jin2024machine,
  author       = {Jin, Zhou and Li, Wenhao and Bai, Yinuo and Wang, Tengcheng and Lu, Yicheng and Liu, Weifeng},
  title        = {Machine Learning and GPU Accelerated Sparse Linear Solvers for Transistor-Level Circuit Simulation: A Perspective Survey (Invited Paper)},
  booktitle    = {2024 29th Asia and South Pacific Design Automation Conference (ASP-DAC)},
  pages        = {96--101},
  year         = {2024},
  organization = {IEEE},
}

@inproceedings{chen2022application,
  author       = {Chen, Yufei and Pei, Haojie and Dong, Xiao and Jin, Zhou and Zhuo, Cheng},
  title        = {Application of deep learning in back-end simulation: Challenges and opportunities},
  booktitle    = {2022 27th Asia and South Pacific Design Automation Conference (ASP-DAC)},
  year         = {2022},
  pages        = {641--646},
  organization = {IEEE},
}

@inproceedings{2022-ji-tilespmspv,
  author    = {Ji, Haonan and Song, Huimin and Lu, Shibo and Jin, Zhou and Tan, Guangming and Liu, Weifeng},
  title     = {TileSpMSpV: A Tiled Algorithm for Sparse Matrix-Sparse Vector Multiplication on GPUs},
  booktitle = {Proceedings of the 51st International Conference on Parallel Processing},
  series    = {ICPP '22},
  year      = {2022},
  numpages  = {11},
  publisher = {Association for Computing Machinery},
  doi       = {10.1145/3545008.3545028},
}

@article{2023-jin-ossp,
  author  = {Niu, Dan and Dong, Yichao and Jin, Zhou and Zhang, Chuan and Li, Qi and Sun, Changyin},
  title   = {OSSP-PTA: An Online Stochastic Stepping Policy for PTA on Reinforcement Learning},
  journal = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  year    = {2023},
  doi     = {10.1109/TCAD.2023.3251731},
}

@article{2022-dong-PTA,
  author  = {Dong, Yichao and Niu, Dan and Jin, Zhou and Zhang, Chuan and Li, Qi and Sun, Changyin},
  title   = {Adaptive Stepping PTA for DC Analysis Based on Reinforcement Learning},
  journal = {IEEE Transactions on Circuits and Systems II: Express Briefs},
  year    = {2023},
  volume  = {70},
  doi     = {10.1109/TCSII.2022.3207356},
}


@article{2022-xing-BoA-PTA,
  author    = {Xing, Wei W. and Jin, Xiang and Feng, Tian and Niu, Dan and Zhao, Weishen and Jin, Zhou},
  title     = {BoA-PTA, An Bayesian Optimization Accelerated PTA Solver for SPICE Simulation},
  journal   = {ACM Trans. Des. Autom. Electron. Syst.},
  year      = {2022},
  month     = jul,
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  issn      = {1084-4309},
  doi       = {10.1145/3555805},
  url       = {https://doi.org/10.1145/3555805},
}


@inproceedings{2022-jin-RLPTA,
  author    = {Jin, Zhou and Pei, Haojie and Dong, Yichao and Jin, Xiang and Wu, Xiao and Xing, Wei W. and Niu, Dan},
  title     = {Accelerating Nonlinear DC Circuit Simulation with Reinforcement Learning},
  booktitle = {Proceedings of the 59th ACM/IEEE Design Automation Conference},
  series    = {DAC '22},
  year      = {2022},
  pages     = {619--624},
  numpages  = {6},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  doi       = {10.1145/3489517.3530512},
  url       = {https://doi.org/10.1145/3489517.3530512},
}


@inproceedings{2021-jin-PALBBD,
  author    = {Jin, Zhou and Feng, Tian and Duan, Yiru and Wu, Xiao and Cheng, Minghou and Zhou, Zhenya and Liu, Weifeng},
  title     = {PALBBD: A Parallel ArcLength Method Using Bordered Block Diagonal Form for DC Analysis},
  booktitle = {Proceedings of the 2021 on Great Lakes Symposium on VLSI},
  year      = {2021},
  pages     = {327--332},
  numpages  = {6},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  isbn      = {9781450383936},
  doi       = {10.1145/3453688.3461526},
  url       = {https://doi.org/10.1145/3453688.3461526},
}


@inproceedings{2022-niu-tilespgemm,
  author    = {Niu, Yuyao and Lu, Zhengyang and Ji, Haonan and Song, Shuhui and Jin, Zhou and Liu, Weifeng},
  title     = {TileSpGEMM: A Tiled Algorithm for Parallel Sparse General Matrix-Matrix Multiplication on GPUs},
  booktitle = {Proceedings of the 27th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  series    = {PPoPP '22},
  year      = {2022},
  pages     = {90--106},
  numpages  = {17},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  isbn      = {9781450392044},
  doi       = {10.1145/3503221.3508431},
  url       = {https://doi.org/10.1145/3503221.3508431},
}


@article{lu2021implementing,
  author  = {Lu, Yuechen and Luo, Yuchen and Lian, Haocheng and Jin, Zhou and Liu, Weifeng},
  title   = {Implementing LU and Cholesky factorizations on artificial intelligence accelerators},
  journal = {CCF Transactions on High Performance Computing},
  year    = {2021},
  volume  = {3},
  number  = {3},
  pages   = {286--297},
  doi     = {10.1007/s42514-021-00075-8},
}


@inproceedings{9586141,
  author    = {Zhao, Jianqi and Wen, Yao and Luo, Yuchen and Jin, Zhou and Liu, Weifeng and Zhou, Zhenya},
  title     = {SFLU: Synchronization-Free Sparse LU Factorization for Fast Circuit Simulation on GPUs},
  booktitle = {2021 58th ACM/IEEE Design Automation Conference (DAC)},
  year      = {2021},
  pages     = {37--42},
  doi       = {10.1109/DAC18074.2021.9586141},
}


@article{9459513,
  author  = {Xie, Zhen and Tan, Guangming and Liu, Weifeng and Sun, Ninghui},
  title   = {A Pattern-Based SpGEMM Library for Multi-Core and Many-Core Architectures},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  year    = {2022},
  volume  = {33},
  number  = {1},
  pages   = {159--175},
  doi     = {10.1109/TPDS.2021.3090328},
}


@article{9380961,
  author  = {Zhang, Feng and Su, Jiya and Liu, Weifeng and He, Bingsheng and Wu, Ruofan and Du, Xiaoyong and Wang, Rujia},
  title   = {YuenyeungSpTRSV: A Thread-Level and Warp-Level Fusion Synchronization-Free Sparse Triangular Solve},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  year    = {2021},
  volume  = {32},
  number  = {9},
  pages   = {2321--2337},
  doi     = {10.1109/TPDS.2021.3066635},
}


@article{9373912,
  author  = {Chen, Jing and Fang, Jianbin and Liu, Weifeng and Yang, Canqun},
  title   = {BALS: Blocked Alternating Least Squares for Parallel Sparse Matrix Factorization},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  year    = {2021},
  volume  = {32},
  number  = {9},
  pages   = {2291--2302},
  doi     = {10.1109/TPDS.2021.3064942},
}


@article{Ji2021,
  author  = {Ji, Haonan and Lu, Shibo and Hou, Kaixi and Wang, Hao and Jin, Zhou and Liu, Weifeng and Vinter, Brian},
  title   = {Segmented Merge: A New Primitive for Parallel Sparse Matrix Computations},
  journal = {International Journal of Parallel Programming},
  year    = {2021},
  month   = mar,
  day     = {26},
  issn    = {1573-7640},
}


@inproceedings{9460505,
  author    = {Niu, Yuyao and Lu, Zhengyang and Dong, Meichen and Jin, Zhou and Liu, Weifeng and Tan, Guangming},
  title     = {TileSpMV: A Tiled Algorithm for Sparse Matrix-Vector Multiplication on GPUs},
  booktitle = {2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  year      = {2021},
  pages     = {68--78},
  doi       = {10.1109/IPDPS49936.2021.00016},
}


@inproceedings{10.1145/3404397.3404413,
  author    = {Lu, Zhengyang and Niu, Yuyao and Liu, Weifeng},
  title     = {Efficient Block Algorithms for Parallel Sparse Triangular Solve},
  booktitle = {49th International Conference on Parallel Processing - ICPP},
  series    = {ICPP '20},
  year      = {2020},
  articleno = {63},
  numpages  = {11},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  location  = {Edmonton, AB, Canada},
  isbn      = {9781450388160},
  doi       = {10.1145/3404397.3404413},
  url       = {https://doi.org/10.1145/3404397.3404413},
  keywords  = {block algorithm, sparse triangular solve, GPU, sparse matrix},
}


@inproceedings{10.1145/3404397.3404400,
  author    = {Su, Jiya and Zhang, Feng and Liu, Weifeng and He, Bingsheng and Wu, Ruofan and Du, Xiaoyong and Wang, Rujia},
  title     = {CapelliniSpTRSV: A Thread-Level Synchronization-Free Sparse Triangular Solve on GPUs},
  booktitle = {49th International Conference on Parallel Processing - ICPP},
  series    = {ICPP '20},
  year      = {2020},
  articleno = {2},
  numpages  = {11},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  location  = {Edmonton, AB, Canada},
  isbn      = {9781450388160},
  doi       = {10.1145/3404397.3404400},
  url       = {https://doi.org/10.1145/3404397.3404400},
}


@inproceedings{10.1007/978-3-030-79478-1_20,
  author    = {Yu, Xiaosong and Ma, Huihui and Qu, Zhengyu and Fang, Jianbin and Liu, Weifeng},
  editor    = {He, Xin and Shao, En and Tan, Guangming},
  title     = {NUMA-Aware Optimization of Sparse Matrix-Vector Multiplication on ARMv8-Based Many-Core Architectures},
  booktitle = {Network and Parallel Computing},
  year      = {2021},
  publisher = {Springer International Publishing},
  address   = {Cham},
  pages     = {231--242},
  isbn      = {978-3-030-79478-1},
}


@inproceedings{xie2019spgemm,
  author    = {Xie, Zhen and Tan, Guangming and Liu, Weifeng and Sun, Ninghui},
  title     = {IA-SpGEMM: An Input-aware Auto-tuning Framework for Parallel Sparse Matrix-Matrix Multiplication},
  booktitle = {Proceedings of the 2019 International Conference on Supercomputing},
  series    = {ICS '19},
  year      = {2019},
  numpages  = {12},
}


@article{zhang2019apu,
  author  = {Zhang, Feng and Liu, Weifeng and Feng, Ningxuan and Zhai, Jidong and Du, Xiaoyong},
  title   = {Performance Evaluation and Analysis of Sparse Matrix and Graph Kernels on Heterogeneous Processors},
  journal = {CCF Transactions on High Performance Computing},
  year    = {2019},
  issn    = {2524-4922},
}


@article{liu2019spgemm,
  author  = {Liu, Junhong and He, Xin and Liu, Weifeng and Tan, Guangming},
  title   = {Register-Aware Optimizations for Parallel Sparse Matrix-Matrix Multiplication},
  journal = {International Journal of Parallel Programming},
  year    = {2019},
  issn    = {1573-7640},
}


@article{chen2018clmf,
  author  = {Chen, Jing and Fang, Jianbin and Liu, Weifeng and Tang, Tao and Yang, Canqun},
  title   = {clMF: A Fine-Grained and Portable Alternating Least Squares Algorithm for Parallel Matrix Factorization},
  journal = {Future Generation Computer Systems},
  year    = {2018},
}


@inproceedings{li2018warp,
  author    = {Li, Ang and Liu, Weifeng and Wang, Linnan and Barker, Kevin and Song, Shuaiwen Leon},
  title     = {Warp-Consolidation: A Novel Execution Model for GPUs},
  booktitle = {Proceedings of the 2018 International Conference on Supercomputing},
  series    = {ICS '18},
  year      = {2018},
  isbn      = {978-1-4503-5783-8},
  location  = {Beijing, China},
  pages     = {53--64},
  numpages  = {12},
  url       = {http://doi.acm.org/10.1145/3205289.3205294},
  doi       = {10.1145/3205289.3205294},
  acmid     = {3205294},
  publisher = {ACM},
  address   = {New York, NY, USA},
}


@inproceedings{wang2018sptrsv,
  author    = {Wang, Xinliang and Liu, Weifeng and Xue, Wei and Wu, Li},
  title     = {swSpTRSV: A Fast Sparse Triangular Solve with Sparse Level Tile Layout on Sunway Architectures},
  booktitle = {Proceedings of the 23rd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  series    = {PPoPP '18},
  year      = {2018},
  pages     = {338--353},
  numpages  = {16},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Vienna, Austria},
  isbn      = {978-1-4503-4982-6},
  doi       = {10.1145/3178487.3178513},
  url       = {http://doi.acm.org/10.1145/3178487.3178513},
  acmid     = {3178513},
}


@inproceedings{liu2018spgemm,
  author    = {Liu, Junhong and He, Xin and Liu, Weifeng and Tan, Guangming},
  title     = {Register-based Implementation of the Sparse General Matrix-matrix Multiplication on GPUs},
  booktitle = {Proceedings of the 23rd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  series    = {PPoPP '18},
  year      = {2018},
  pages     = {407--408},
  numpages  = {2},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Vienna, Austria},
  isbn      = {978-1-4503-4982-6},
  doi       = {10.1145/3178487.3178529},
  url       = {http://doi.acm.org/10.1145/3178487.3178529},
  acmid     = {3178529},
}


@inproceedings{li2017hbm,
  author    = {Li, Ang and Liu, Weifeng and Kristensen, Mads R. B. and Vinter, Brian and Wang, Hao and Hou, Kaixi and Marquez, Andres and Song, Shuaiwen Leon},
  title     = {Exploring and Analyzing the Real Impact of Modern On-package Memory on HPC Scientific Kernels},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
  series    = {SC '17},
  year      = {2017},
  pages     = {26:1--26:14},
  articleno = {26},
  numpages  = {14},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Denver, Colorado},
  isbn      = {978-1-4503-5114-0},
  doi       = {10.1145/3126908.3126931},
  url       = {http://doi.acm.org/10.1145/3126908.3126931},
  acmid     = {3126931},
}


@article{liu2017sptrsv,
  author  = {Liu, Weifeng and Li, Ang and Hogg, Jonathan D. and Duff, Iain S. and Vinter, Brian},
  title   = {Fast Synchronization-Free Algorithms for Parallel Sparse Triangular Solves with Multiple Right-Hand Sides},
  journal = {Concurrency and Computation: Practice and Experience},
  volume  = {29},
  number  = {21},
  pages   = {e4244},
  issn    = {1532-0634},
  url     = {http://dx.doi.org/10.1002/cpe.4244},
  doi     = {10.1002/cpe.4244},
  year    = {2017},
}


@inproceedings{hou2017segsort,
  author    = {Hou, Kaixi and Liu, Weifeng and Wang, Hao and Feng, Wu-chun},
  title     = {Fast Segmented Sort on GPUs},
  booktitle = {Proceedings of the International Conference on Supercomputing},
  series    = {ICS '17},
  year      = {2017},
  pages     = {12:1--12:10},
  articleno = {12},
  numpages  = {10},
  publisher = {ACM},
  location  = {Chicago, Illinois},
  isbn      = {978-1-4503-5020-4},
  doi       = {10.1145/3079079.3079105},
  url       = {http://doi.acm.org/10.1145/3079079.3079105},
  acmid     = {3079105},
}


@inproceedings{li2017cta,
  author    = {Li, Ang and Song, Shuaiwen Leon and Liu, Weifeng and Liu, Xu and Kumar, Akash and Corporaal, Henk},
  title     = {Locality-Aware CTA Clustering for Modern GPUs},
  booktitle = {Proceedings of the Twenty-Second International Conference on Architectural Support for Programming Languages and Operating Systems},
  series    = {ASPLOS '17},
  year      = {2017},
  pages     = {297--311},
  numpages  = {15},
  publisher = {ACM},
  location  = {Xi'an, China},
  isbn      = {978-1-4503-4465-4},
  doi       = {10.1145/3037697.3037709},
  url       = {http://doi.acm.org/10.1145/3037697.3037709},
  acmid     = {3037709},
}


@inproceedings{chen2017als,
  author    = {Jing Chen and Jianbin Fang and Weifeng Liu and Tao Tang and Xuhao Chen and Canqun Yang},
  title     = {Efficient and Portable ALS Matrix Factorization for Recommender Systems},
  booktitle = {Proceedings of the 6th International Workshop on Parallel and Distributed Computing for Large Scale Machine Learning and Big Data Analytics},
  series    = {Parlearning '17},
  pages     = {409--418},
  year      = {2017},
}


@inproceedings{liu2016sptrsv,
  author    = {Liu, Weifeng and Li, Ang and Hogg, Jonathan and Duff, Iain S. and Vinter, Brian},
  title     = {A Synchronization-Free Algorithm for Parallel Sparse Triangular Solves},
  booktitle = {Proceedings of the 22nd International Conference on Euro-Par 2016: Parallel Processing - Volume 9833},
  year      = {2016},
  isbn      = {978-3-319-43658-6},
  pages     = {617--630},
  numpages  = {14},
  url       = {http://dx.doi.org/10.1007/978-3-319-43659-3_45},
  doi       = {10.1007/978-3-319-43659-3_45},
  acmid     = {2990990},
  publisher = {Springer-Verlag New York, Inc.},
}


@inproceedings{wang2016sptrans,
  author    = {Wang, Hao and Liu, Weifeng and Hou, Kaixi and Feng, Wu-chun},
  title     = {Parallel Transposition of Sparse Data Structures},
  booktitle = {Proceedings of the 2016 International Conference on Supercomputing},
  series    = {ICS '16},
  year      = {2016},
  pages     = {33:1--33:13},
  numpages  = {13},
  publisher = {ACM},
  location  = {Istanbul, Turkey},
  isbn      = {978-1-4503-4361-9},
  doi       = {10.1145/2925426.2926291},
  url       = {http://doi.acm.org/10.1145/2925426.2926291},
}


@inproceedings{liu2015csr5,
  author    = {Liu, Weifeng and Vinter, Brian},
  title     = {CSR5: An Efficient Storage Format for Cross-Platform Sparse Matrix-Vector Multiplication},
  booktitle = {Proceedings of the 29th ACM International Conference on Supercomputing},
  series    = {ICS '15},
  year      = {2015},
  pages     = {339--350},
  numpages  = {12},
  publisher = {ACM},
  location  = {Newport Beach, California, USA},
  isbn      = {978-1-4503-3559-1},
  doi       = {10.1145/2751205.2751209},
  url       = {http://doi.acm.org/10.1145/2751205.2751209},
}


@article{liu2015spmv,
  author     = {Liu, Weifeng and Vinter, Brian},
  title      = {Speculative Segmented Sum for Sparse Matrix-vector Multiplication on Heterogeneous Processors},
  journal    = {Parallel Computing},
  year       = {2015},
  month      = nov,
  issue_date = {November 2015},
  volume     = {49},
  number     = {C},
  pages      = {179--193},
  numpages   = {15},
  issn       = {0167-8191},
  doi        = {10.1016/j.parco.2015.04.004},
  url        = {https://doi.org/10.1016/j.parco.2015.04.004},
}


@article{liu2015spgemm,
  author   = {Liu, Weifeng and Vinter, Brian},
  title    = {A Framework for General Sparse Matrix-Matrix Multiplication on GPUs and Heterogeneous Processors},
  journal  = {Journal of Parallel and Distributed Computing},
  year     = {2015},
  month    = nov,
  volume   = {85},
  number   = {C},
  pages    = {47--61},
  numpages = {15},
  issn     = {0743-7315},
  doi      = {10.1016/j.jpdc.2015.06.010},
  url      = {http://dx.doi.org/10.1016/j.jpdc.2015.06.010},
}


@inproceedings{liu2014spgemm,
  author    = {Liu, Weifeng and Vinter, Brian},
  title     = {An Efficient GPU General Sparse Matrix-Matrix Multiplication for Irregular Data},
  booktitle = {Proceedings of the 2014 IEEE 28th International Parallel and Distributed Processing Symposium},
  series    = {IPDPS '14},
  year      = {2014},
  pages     = {370--381},
  numpages  = {12},
  publisher = {IEEE Computer Society},
  isbn      = {978-1-4799-3800-1},
  doi       = {10.1109/IPDPS.2014.47},
  url       = {http://dx.doi.org/10.1109/IPDPS.2014.47},
}


@inproceedings{liu2014adheap,
  author    = {Liu, Weifeng and Vinter, Brian},
  title     = {Ad-heap: An Efficient Heap Data Structure for Asymmetric Multicore Processors},
  booktitle = {Proceedings of Workshop on General Purpose Processing Using GPUs},
  series    = {GPGPU-7},
  year      = {2014},
  pages     = {54:54--54:63},
  numpages  = {10},
  publisher = {ACM},
  isbn      = {978-1-4503-2766-4},
  doi       = {10.1145/2576779.2576786},
  url       = {http://doi.acm.org/10.1145/2576779.2576786},
}


@phdthesis{phdthesisliu,
  author = {Weifeng Liu},
  title  = {Parallel and Scalable Sparse Basic Linear Algebra Subprograms},
  school = {University of Copenhagen},
  year   = 2015,
}


@article{ren2018trans,
  author  = {Ren, H. and Kanhabua, N. and M{\o}gelmose, A. and Liu, W. and Kulkarni, K. and Escalera, S. and Bar{\'o}, X. and Moeslund, T. B.},
  title   = {Back-Dropout Transfer Learning for Action Recognition},
  journal = {IET Computer Vision},
  year    = {2018},
  volume  = {12},
  number  = {4},
  pages   = {484--491},
  doi     = {10.1049/iet-cvi.2016.0309},
}


@inproceedings{ren2015dl,
  author    = {Ren, Huamin and Liu, Weifeng and Olsen, S{\o}ren Ingvor and Escalera, Sergio and Moeslund, Thomas B.},
  title     = {Unsupervised Behavior-Specific Dictionary Learning for Abnormal Event Detection},
  booktitle = {Proceedings of the British Machine Vision Conference (BMVC)},
  editor    = {Xie, Xianghua and Jones, Mark W. and Tam, Gary K. L.},
  year      = {2015},
  month     = sep,
  pages     = {28.1--28.13},
  articleno = {28},
  numpages  = {13},
  publisher = {BMVA Press},
  isbn      = {1-901725-53-7},
  doi       = {10.5244/C.29.28},
  url       = {https://dx.doi.org/10.5244/C.29.28},
}