publications.bib

@inproceedings{lai2019improving,
  author = {Lai, Zhengmin and Yang, Zhengyi and Lai, Longbin},
  booktitle = {2019 IEEE 35th International Conference on Data Engineering Workshops (ICDEW)},
  title = {Improving Distributed Subgraph Matching Algorithm on Timely Dataflow},
  year = {2019},
  volume = {},
  number = {},
  pages = {266-273},
  doi = {10.1109/ICDEW.2019.000-2},
  abstract = {The subgraph matching problem is defined to find all subgraphs of a data graph that are isomorphic to a given query graph. Subgraph matching plays a vital role in the fields of e-commerce, social media and biological science. CliqueJoin is a distributed subgraph matching algorithm that is designed to be efficient and scalable. However, CliqueJoin is originally developed on MapReduce, thus the performance of the algorithm can be affected by the notorious I/O issue of MapReduce while processing multi-round join tasks. Meanwhile, CliqueJoin does not propose a cost evaluation strategy for labelled graphs, which limits its application in practice where most real-world graphs are labelled. Targeting the limitations of CliqueJoin, we propose CliqueJoin++ to improve CliqueJoin in two aspects. Firstly, we implement CliqueJoin++ on the Timely dataflow system instead of MapReduce to avoid considerable I/O cost. Secondly, we extend the cost evaluation function in CliqueJoin to compute optimal join plans for labelled graphs in the distributed context. Extensive experiments have been conducted to show that the proposed method is up to 10 times faster than the MapReduce version for unlabelled matching, and it achieves good performance and scalability for labelled matching.}
}
@inproceedings{lai2019distributed,
  author = {Lai, Longbin and Qing, Zhu and Yang, Zhengyi and Jin, Xin and Lai, Zhengmin and Wang, Ran and Hao, Kongzhang and Lin, Xuemin and Qin, Lu and Zhang, Wenjie and Zhang, Ying and Qian, Zhengping and Zhou, Jingren},
  title = {Distributed Subgraph Matching on Timely Dataflow},
  year = {2019},
  issue_date = {June 2019},
  publisher = {VLDB Endowment},
  volume = {12},
  number = {10},
  issn = {2150-8097},
  url = {https://doi.org/10.14778/3339490.3339494},
  doi = {10.14778/3339490.3339494},
  abstract = {Recently there emerge many distributed algorithms that aim at solving subgraph matching at scale. Existing algorithm-level comparisons failed to provide a systematic view of distributed subgraph matching mainly due to the intertwining of strategy and optimization. In this paper, we identify four strategies and three general-purpose optimizations from representative state-of-the-art algorithms. We implement the four strategies with the optimizations based on the common Timely dataflow system for systematic strategy-level comparison. Our implementation covers all representative algorithms. We conduct extensive experiments for both unlabelled matching and labelled matching to analyze the performance of distributed subgraph matching under various settings, which is finally summarized as a practical guide.},
  booktitle = {Proc. VLDB Endow. (VLDB)},
  pages = {1099-1112},
  numpages = {14}
}
@inproceedings{hao2019patmat,
  author = {Hao, Kongzhang and Yang, Zhengyi and Lai, Longbin and Lai, Zhengmin and Jin, Xin and Lin, Xuemin},
  title = {PatMat: A Distributed Pattern Matching Engine with Cypher},
  year = {2019},
  isbn = {9781450369763},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3357384.3357840},
  doi = {10.1145/3357384.3357840},
  abstract = {Graph pattern matching is one of the most fundamental problems in graph database and is associated with a wide spectrum of applications. Due to its computational intensiveness, researchers have primarily devoted their efforts to improving the performance of the algorithm while constraining the graphs to have singular labels on vertices (edges) or no label. Whereas in practice graphs are typically associated with rich properties, thus the main focus in the industry is instead on powerful query languages that can express a sufficient number of pattern matching scenarios. We demo PatMat in this work to glue together the academic efforts on performance and the industrial efforts on expressiveness. To do so, we leverage the state-of-the-art join-based algorithms in the distributed contexts and Cypher query language - the most widely-adopted declarative language for graph pattern matching. The experiments demonstrate how we are capable of turning complex Cypher semantics into a distributed solution with high performance.},
  booktitle = {Proceedings of the 28th ACM International Conference on Information and Knowledge Management (CIKM)},
  pages = {2921-2924},
  numpages = {4},
  keywords = {cypher, distributed processing, graph database, graph pattern matching, join optimization},
  location = {Beijing, China},
  series = {CIKM '19}
}
@inproceedings{wang2020empirical,
  author = {Wang, Ran and Yang, Zhengyi and Zhang, Wenjie and Lin, Xuemin},
  title = {An Empirical Study on Recent Graph Database Systems},
  year = {2020},
  isbn = {978-3-030-55129-2},
  publisher = {Springer International Publishing},
  address = {Berlin, Heidelberg},
  url = {https://doi.org/10.1007/978-3-030-55130-8_29},
  doi = {10.1007/978-3-030-55130-8_29},
  abstract = {Graphs are widely used to model the intricate relationships among objects in a wide range of applications. The advance in graph data has brought significant value to artificial intelligence technologies. Recently, a number of graph database systems have been developed. In this paper, we present a comprehensive overview and empirical investigation on existing property graph database systems such as Neo4j, AgensGraph, TigerGraph and LightGraph (LightGraph has recently renamed to TuGraph.). These systems support declarative graph query languages. Our empirical studies are conducted in a single-machine environment against on the LDBC social network benchmark, consisting of three different large-scale datasets and a set of benchmark queries. This is the first empirical study to compare these graph database systems by evaluating data bulk importing and processing simple and complex queries. Experimental results provide insightful observations of various graph data systems and indicate that AgensGraph works well on SQL based workload and simple update queries, TigerGraph is powerful on complex business intelligence queries, Neo4j is user-friendly and suitable for small queries, and LightGraph is a more balanced product achieving good performance on different queries. The related code, scripts and data of this paper are available online (https://github.com/UNSW-database/GraphDB-Benchmark).},
  booktitle = {13th International Conference Knowledge Science, Engineering and Management (KSEM)},
  pages = {328-340},
  numpages = {13},
  keywords = {LDBC benchmark, Graph database systems, Labeled property graph},
  location = {Hangzhou, China}
}
@inproceedings{jin2021fast,
  author = {Jin, Xin and Yang, Zhengyi and Lin, Xuemin and Yang, Shiyu and Qin, Lu and Peng, You},
  booktitle = {2021 IEEE 37th International Conference on Data Engineering (ICDE)},
  title = {FAST: FPGA-based Subgraph Matching on Massive Graphs},
  year = {2021},
  volume = {},
  number = {},
  pages = {1452-1463},
  doi = {10.1109/ICDE51399.2021.00129},
  abstract = {Subgraph matching is a basic operation widely used in many applications. However, due to its NP-hardness and the explosive growth of graph data, it is challenging to compute subgraph matching, especially in large graphs. In this paper, we aim at scaling up subgraph matching on a single machine using FPGAs. Specifically, we propose a CPU-FPGA co-designed framework. On the CPU side, we first develop a novel auxiliary data structure called candidate search tree (CST) which serves as a complete search space of subgraph matching. CST can be partitioned and fully loaded into FPGAs' on-chip memory. Then, a workload estimation technique is proposed to balance the load between the CPU and FPGA. On the FPGA side, we design and implement the first FPGA-based subgraph matching algorithm, called FAST. To take full advantage of the pipeline mechanism on FPGAs, task parallelism optimization and task generator separation strategy are proposed for FAST, achieving massive parallelism. Moreover, we carefully develop a BRAM-only matching process to fully utilize FPGA's on-chip memory, which avoids the expensive intermediate data transfer between FPGA's BRAM and DRAM. Comprehensive experiments show that FAST achieves up to 462.0x and 150.0x speedup compared with the state-of-the-art algorithm DAF and CECI, respectively. In addition, FAST is the only algorithm that can handle the billion-scale graph using one machine in our experiments.}
}
@inproceedings{yang2021huge,
  author = {Yang, Zhengyi and Lai, Longbin and Lin, Xuemin and Hao, Kongzhang and Zhang, Wenjie},
  title = {HUGE: An Efficient and Scalable Subgraph Enumeration System},
  year = {2021},
  isbn = {9781450383431},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3448016.3457237},
  doi = {10.1145/3448016.3457237},
  abstract = {Subgraph enumeration is a fundamental problem in graph analytics, which aims to find all instances of a given query graph on a large data graph. In this paper, we propose a system called HUGE to efficiently process subgraph enumeration at scale in the distributed context. HUGE features 1) an optimiser to compute an advanced execution plan without the constraints of existing works; 2) a hybrid communication layer that supports both pushing and pulling communication; 3) a novel two-stage execution mode with a lock-free and zero-copy cache design; 4) a BFS/DFS-adaptive scheduler to bound memory consumption; and 5) two-layer intra- and inter-machine load balancing. HUGE is generic such that all existing distributed subgraph enumeration algorithms can be plugged in to enjoy automatic speed up and bounded-memory execution.},
  booktitle = {Proceedings of the 2021 International Conference on Management of Data (SIGMOD)},
  pages = {2049-2062},
  numpages = {14},
  keywords = {subgraph enumeration, cache, join processing, load balancing, distributed graph processing, dynamic scheduling},
  location = {Virtual Event, China},
  series = {SIGMOD/PODS '21}
}
@inproceedings{li2021sql2cypher,
  author = {Li, Shunyang and Yang, Zhengyi and Zhang, Xianhang and Zhang, Wenjie and Lin, Xuemin},
  title = {SQL2Cypher: Automated Data and Query Migration from RDBMS to GDBMS},
  year = {2021},
  isbn = {978-3-030-91559-9},
  publisher = {Springer International Publishing},
  url = {https://doi.org/10.1007/978-3-030-91560-5_39},
  doi = {10.1007/978-3-030-91560-5_39},
  abstract = {There are many real-world application domains where data can be naturally modelled as a graph, such as social networks and computer networks. Relational Database Management Systems (RDBMS) find it hard to capture the relationships and inherent graph structure of data and are inappropriate for storing highly connected data; thus, graph databases have emerged to address the challenges of high data connectivity. As the performance of querying highly connected data in relational query statements is usually worse than that in the graph database. Transforming data from a relational database to a graph database is imperative for improving the performance of graph queries. In this paper, we demonstrate SQL2Cypher, a system for migrating data from a relational database to a graph database automatically. This system also supports translating SQL queries into Cypher queries. SQL2Cypher is open-source (https://github.com/UNSW-database/SQL2Cypher) to allow researchers and programmers to migrate data efficiently. Our demonstration video can be found here: https://www.youtube.com/watch?v=eGaeBrVTJws.},
  booktitle = {22nd International Conference on Web Information Systems Engineering (WISE)},
  pages = {510-517},
  numpages = {8},
  keywords = {Cypher, Data migration, GDBMS, SQL, RDBMS},
  location = {Melbourne, VIC, Australia}
}
@inproceedings{li2022hop,
  author = {Li, Xia and Hao, Kongzhang and Yang, Zhengyi and Cao, Xin and Zhang, Wenjie and Yuan, Long and Lin, Xuemin},
  title = {Hop-Constrained s-t Simple Path Enumeration in Billion-Scale Labelled Graphs},
  year = {2022},
  isbn = {978-3-031-20890-4},
  publisher = {Springer International Publishing},
  url = {https://doi.org/10.1007/978-3-031-20891-1_5},
  doi = {10.1007/978-3-031-20891-1_5},
  abstract = {Hop-constrained s-t simple path (HC-s-tpath) enumeration is a fundamental problem in graph analysis. Existing solutions for this problem focus on unlabelled graphs and assume queries are issued without any label constraints. However, in many real-world applications, graphs are edge-labelled and the queries involve label constraints on the path connecting two vertices. Therefore, we study the problem of labelled hop-constrained s-t path (LHC-s-tpath) enumeration in this paper. We aim to efficiently enumerate the HC-s-tpaths using only edges with provided labels. To achieve this goal, we first demonstrate the existence of unnecessary computation specific to the label constraints in the state-of-the-art HC-s-tpath enumeration algorithm. We then devise a novel online index to identify the fruitless exploration during the enumeration. Based on the proposed index, we design an efficient LHC-s-tpath enumeration algorithm in which unnecessary computation is effectively pruned. Extensive experiments are conducted on real-world graphs with billions of edges. Experiment results show that our proposed algorithms significantly outperform the baseline methods by over one order of magnitude.},
  booktitle = {Web Information Systems Engineering (WISE)},
  pages = {49-64},
  numpages = {16},
  keywords = {Path enumeration, Graph, Labelled graph},
  location = {Biarritz, France}
}
@inproceedings{ukey2022efficient,
  author = {Ukey, Nimish and Yang*, Zhengyi and Zhang, Guangjian and Liu, Boge and Li, Binghao and Zhang, Wenjie},
  title = {Efficient KNN Join Over Dynamic High-Dimensional Data},
  year = {2022},
  isbn = {978-3-031-15511-6},
  publisher = {Springer International Publishing},
  url = {https://doi.org/10.1007/978-3-031-15512-3_5},
  doi = {10.1007/978-3-031-15512-3_5},
  abstract = {Given a user dataset U and an object dataset I in high-dimensional space, a kNN join query retrieves each object in dataset U its k nearest neighbors from the dataset I. kNN join is a fundamental and essential operation in applications from many domains such as databases, computer vision, multi-media, machine learning, recommendation systems, and many more. The datasets in real world often update dynamically on insertion or deletion of objects. However, existing algorithms of dynamic kNN join lack support for deletion and batch update, which are important in real-life applications. In this paper, we propose a new method of kNN join over dynamic high-dimensional data. Specifically, our method features lazy updates, batch operations, and optimised deletions. Experiments on real-world datasets show that our method outperforms the existing algorithms of naive RkNN join and HDR Tree by up to 5 and 4 times, respectively.},
  booktitle = {Australasian Database Conference (ADC)},
  pages = {63-75},
  numpages = {13},
  keywords = {Dynamic data, High-dimensional data, kNN join},
  location = {Sydney, NSW, Australia}
}
@inproceedings{li2022hop2,
  author = {Li, Xia and Hao, Kongzhang and Yang, Zhengyi and Cao, Xin and Zhang, Wenjie},
  title = {Hop-Constrained s-t Simple Path Enumeration In Large Uncertain Graphs},
  year = {2022},
  isbn = {978-3-031-15511-6},
  publisher = {Springer International Publishing},
  url = {https://doi.org/10.1007/978-3-031-15512-3_9},
  doi = {10.1007/978-3-031-15512-3_9},
  abstract = {Uncertain graphs are graphs where each edge is assigned with a probability of existence. In this paper, we study the problem of hop-constrained s-t simple path enumeration in large uncertain graphs. To the best of our knowledge, we are the first to study this problem in the literature. Specifically, we propose a light-weight index to prune candidate paths by adopting the concept of probability-constrained distance. An efficient enumeration algorithm is designed based on the index structure. Experiment results on real-world datasets show that our proposed methods significantly outperform the baseline methods by up to 6 times.},
  booktitle = {Australasian Database Conference (ADC)},
  pages = {115-127},
  numpages = {13},
  keywords = {Graph, Path enumeration, Uncertain graph},
  location = {Sydney, NSW, Australia}
}
@article{xu2022deep,
  author = {Xu, Weixiao and Sun, Lin and Zhen, Cheng and Liu, Bo and Yang, Zhengyi and Yang, Wenke},
  title = {Deep Learning-Based Image Recognition of Agricultural Pests},
  journal = {Applied Sciences},
  volume = {12},
  year = {2022},
  number = {24},
  article-number = {12896},
  url = {https://www.mdpi.com/2076-3417/12/24/12896},
  issn = {2076-3417},
  abstract = {Pests and diseases are an inevitable problem in agricultural production, causing substantial economic losses yearly. The application of convolutional neural networks to the intelligent recognition of crop pest images has become increasingly popular due to advances in deep learning methods and the rise of large-scale datasets. However, the diversity and complexity of pest samples, the size of sample images, and the number of examples all directly affect the performance of convolutional neural networks. Therefore, we designed a new target-detection framework based on Cascade RCNN (Regions with CNN features), aiming to solve the problems of large image size, many pest types, and small and unbalanced numbers of samples in pest sample datasets. Specifically, this study performed data enhancement on the original samples to solve the problem of a small and unbalanced number of examples in the dataset and developed a sliding window cropping method, which could increase the perceptual field to learn sample features more accurately and in more detail without changing the original image size. Secondly, combining the attention mechanism with the FPN (Feature Pyramid Networks) layer enabled the model to learn sample features that were more important for the current task from both channel and space aspects. Compared with the current popular target-detection frameworks, the average precision value of our model (mAP@0.5) was 84.16\%, the value of (mAP@0.5:0.95) was 65.23\%, the precision was 67.79\%, and the F1 score was 82.34\%. The experiments showed that our model solved the problem of convolutional neural networks being challenging to use because of the wide variety of pest types, the large size of sample images, and the difficulty of identifying tiny pests.},
  doi = {10.3390/app122412896}
}
@article{ukey2023survey,
  author = {Ukey, Nimish and Yang*, Zhengyi and Li, Binghao and Zhang, Guangjian and Hu, Yiheng and Zhang, Wenjie},
  title = {Survey on Exact kNN Queries over High-Dimensional Data Space},
  journal = {Sensors},
  volume = {23},
  year = {2023},
  number = {2},
  article-number = {629},
  url = {https://www.mdpi.com/1424-8220/23/2/629},
  issn = {1424-8220},
  abstract = {k nearest neighbours (kNN) queries are fundamental in many applications, ranging from data mining, recommendation system and Internet of Things, to Industry 4.0 framework applications. In mining, specifically, it can be used for the classification of human activities, iterative closest point registration and pattern recognition and has also been helpful for intrusion detection systems and fault detection. Due to the importance of kNN queries, many algorithms have been proposed in the literature, for both static and dynamic data. In this paper, we focus on exact kNN queries and present a comprehensive survey of exact kNN queries. In particular, we study two fundamental types of exact kNN queries: the kNN Search queries and the kNN Join queries. Our survey focuses on exact approaches over high-dimensional data space, which covers 20 kNN Search methods and 9 kNN Join methods. To the best of our knowledge, this is the first work of a comprehensive survey of exact kNN queries over high-dimensional datasets. We specifically categorise the algorithms based on indexing strategies, data and space partitioning strategies, clustering techniques and the computing paradigm. We provide useful insights for the evolution of approaches based on the various categorisation factors, as well as the possibility of further expansion. Lastly, we discuss some open challenges and future research directions.},
  doi = {10.3390/s23020629}
}
@inproceedings{yang2023hgmatch,
  author = {Yang, Zhengyi and Zhang, Wenjie and Lin*, Xuemin and Zhang, Ying and Li, Shunyang},
  booktitle = {2023 IEEE 39th International Conference on Data Engineering (ICDE)},
  title = {HGMatch: A Match-by-Hyperedge Approach for Subgraph Matching on Hypergraphs},
  year = {2023},
  volume = {},
  number = {},
  pages = {2063-2076},
  doi = {10.1109/ICDE55515.2023.00160},
  abstract = {Hypergraphs are generalisation of graphs in which a hyperedge can connect any number of vertices. It can describe n-ary relationships and high-order information among entities compared to conventional graphs. In this paper, we study the fundamental problem of subgraph matching on hypergraphs (i.e, subhypergraph matching). Existing methods directly extend subgraph matching algorithms to the case of hypergraphs. However, this approach delays hyperedge verification and underutilises the high-order information in hypergraphs, which leads to large search space and high enumeration cost. Furthermore, with the growing size of hypergraphs, it is becoming hard to compute subhypergraph matching sequentially. Thus, we propose an efficient and parallel subhypergraph matching system, HGMatch, to handle subhypergraph matching in massive hypergraphs. We proposes a novel match-by-hyperedge framework to utilise high-order information in hypergraphs and uses set operations for efficient candidates generation. Moreover, we develop an optimised parallel execution engine in HGMatch based on the dataflow model, which features a task-based scheduler and fine-grained dynamic work stealing to achieve bounded memory execution and better load balancing. Experimental evaluation on 10 real-world datasets shows that HGMatch outperforms the extended version of the state-of-the-art subgraph matching algorithms (CFL, DAF, CECI and RapidMatch) by orders of magnitude when using a single thread, and achieves almost linear scalability when the number of threads increases.}
}
@inproceedings{hao2023efficient,
  author = {Hao, Kongzhang and Yuan, Long and Yang, Zhengyi and Zhang, Wenjie and Lin, Xuemin},
  title = {Efficient and Scalable Distributed Graph Structural Clustering at Billion Scale},
  booktitle = {International Conference on Database Systems for Advanced Applications (DASFAA)},
  year = {2023},
  publisher = {Springer Nature Switzerland},
  address = {Cham},
  pages = {234--251},
  abstract = {Structural Graph Clustering (SCAN) is a fundamental problem in graph analysis and has received considerable attention recently. Existing distributed solutions either lack efficiency or suffer from high memory consumption when addressing this problem in billion-scale graphs. Motivated by these, in this paper, we aim to devise a distributed algorithm for SCAN that is both efficient and scalable. We first propose a fine-grained clustering framework tailored for SCAN. Based on the new framework, we devise a distributed SCAN algorithm, which not only keeps a low communication overhead during execution, but also effectively reduces the memory consumption at all time. We also devise an effective workload balance mechanism that is automatically triggered by the idle machines to handle skewed workloads. The experiment results demonstrate the efficiency and scalability of our proposed algorithm.},
  isbn = {978-3-031-30675-4},
  doi = {10.1007/978-3-031-30675-4_16},
  url = {https://doi.org/10.1007/978-3-031-30675-4_16}
}
@article{ukey2023efficient,
  author = {Ukey#, Nimish and Zhang#, Guangjian and Yang*, Zhengyi and Li, Binghao and Li, Wei and Zhang, Wenjie},
  title = {Efficient continuous kNN join over dynamic high-dimensional data},
  journal = {World Wide Web},
  year = {2023},
  abstract = {Given a user dataset U and an object dataset I, a kNN join query in high-dimensional space returns the k nearest neighbors of each object in dataset U from the object dataset I. The kNN join is a basic and necessary operation in many applications, such as databases, data mining, computer vision, multi-media, machine learning, recommendation systems, and many more. In the real world, datasets frequently update dynamically as objects are added or removed. In this paper, we propose novel methods of continuous kNN join over dynamic high-dimensional data. We firstly propose the HDR+ Tree, which supports more efficient insertion, deletion, and batch update. Further observed that the existing methods rely on globally correlated datasets for effective dimensionality reduction, we then propose the HDR Forest. It clusters the dataset and constructs multiple HDR Trees to capture local correlations among the data. As a result, our HDR Forest is able to process non-globally correlated datasets efficiently. Two novel optimisations are applied to the proposed HDR Forest, including the precomputation of the PCA states of data items and pruning-based kNN recomputation during item deletion. For the completeness of the work, we also present the proof of computing distances in reduced dimensions of PCA in HDR Tree. Extensive experiments on real-world datasets show that the proposed methods and optimisations outperform the baseline algorithms of naive RkNN join and HDR Tree.},
  issn = {1573-1413},
  doi = {10.1007/s11280-023-01204-9},
  url = {https://doi.org/10.1007/s11280-023-01204-9}
}
@inproceedings{ukey2023knn,
  author = {Ukey, Nimish and Yang*, Zhengyi and Yang, Wenke and Li, Binghao and Li, Runze},
  title = {kNN Join for Dynamic High-Dimensional Data: A Parallel Approach},
  booktitle = {Australasian Database Conference (ADC)},
  year = {2023},
  publisher = {Springer Nature Switzerland},
  pages = {3--16},
  abstract = {The k nearest neighbor (kNN) join operation is a fundamental task that combines two high-dimensional databases, enabling data points in the User dataset U to identify their k nearest neighbor points from the Item dataset I. This operation plays a crucial role in various domains, including knowledge discovery, data mining, similarity search applications, and scientific research. However, exact kNN search in high-dimensional spaces is computationally demanding, and existing sequential methods face challenges in handling large datasets. In this paper, we propose an efficient parallel solution for dynamic kNN join over high-dimensional data, leveraging the high-dimensional R tree (HDR Tree) for improved efficiency. Our solution harnesses the power of Simultaneous Multi-Threading (SMT) technologies and Single-Instruction-Multiple-Data (SIMD) instructions in modern CPUs for parallelisation. Importantly, our research is the first to introduce parallel computation for exact kNN join over high-dimensional data. Experimental results demonstrate that our proposed approach outperforms the sequential HDR Tree method by up to 1.2 times with a single thread. Moreover, our solution provides near-linear scalability as the number of threads increases.},
  isbn = {978-3-031-47843-7},
  doi = {10.1007/978-3-031-47843-7_1},
  url = {https://doi.org/10.1007/978-3-031-47843-7_1}
}
@inproceedings{ma2023empirical,
  author = {Ma, Miao and Yang*, Zhengyi and Hao, Kongzhang and Chen, Liuyi and Wang, Chunling and Jin, Yi},
  title = {An Empirical Analysis of Just-in-Time Compilation in Modern Databases},
  booktitle = {Australasian Database Conference (ADC)},
  year = {2023},
  publisher = {Springer Nature Switzerland},
  pages = {227--240},
  abstract = {JIT (Just-in-Time) technology has garnered significant attention for improving the efficiency of database execution. It offers higher performance by eliminating interpretation overhead compared to traditional execution engines. LLVM serves as the primary JIT architecture, which was implemented in PostgreSQL since version 11. However, recent advancements in WASM-based databases, such as Mutable, present an alternative JIT approach. This approach minimizes the extensive engineering efforts associated with the execution engine and focuses on optimizing supported operators for lower latency and higher throughput. In this paper, we perform comprehensive experiments on these two representative open-source databases to gain deeper insights into the effectiveness of different JIT architectures.},
  isbn = {978-3-031-47843-7},
  doi = {10.1007/978-3-031-47843-7_16},
  url = {https://doi.org/10.1007/978-3-031-47843-7_16}
}
@article{chen2023machine,
  author = {Chen, Liuyi and Han, Bocheng and Wang, Xuesong and Zhao, Jiazhen and Yang, Wenke and Yang*, Zhengyi},
  title = {Machine Learning Methods in Weather and Climate Applications: A Survey},
  journal = {Applied Sciences},
  volume = {13},
  year = {2023},
  number = {21},
  article-number = {12019},
  url = {https://www.mdpi.com/2076-3417/13/21/12019},
  issn = {2076-3417},
  abstract = {With the rapid development of artificial intelligence, machine learning is gradually becoming popular for predictions in all walks of life. In meteorology, it is gradually competing with traditional climate predictions dominated by physical models. This survey aims to consolidate the current understanding of Machine Learning (ML) applications in weather and climate prediction—a field of growing importance across multiple sectors, including agriculture and disaster management. Building upon an exhaustive review of more than 20 methods highlighted in existing literature, this survey pinpointed eight techniques that show particular promise for improving the accuracy of both short-term weather and medium-to-long-term climate forecasts. According to the survey, while ML demonstrates significant capabilities in short-term weather prediction, its application in medium-to-long-term climate forecasting remains limited, constrained by factors such as intricate climate variables and data limitations. Current literature tends to focus narrowly on either short-term weather or medium-to-long-term climate forecasting, often neglecting the relationship between the two, as well as general neglect of modeling structure and recent advances. By providing an integrated analysis of models spanning different time scales, this survey aims to bridge these gaps, thereby serving as a meaningful guide for future interdisciplinary research in this rapidly evolving field.},
  doi = {10.3390/app132112019}
}
@inproceedings{zhang2023efficient,
  author = {Zhang, Wenqian and Yang*, Zhengyi and Wen, Dong and Wang, Xiaoyang},
  booktitle = {2023 IEEE International Conference on Data Mining Workshops (ICDMW)},
  title = {Efficient Distributed Core Graph Decomposition},
  year = {2023},
  volume = {},
  number = {},
  pages = {1023-1031},
  keywords = {Proteins;Filtering;Conferences;Network analyzers;Filtering algorithms;Data structures;Data mining;k-Core;Core decomposition;Distributed Computing},
  doi = {10.1109/ICDMW60847.2023.00135},
  abstract = {Core decomposition is one of the most fundamental problems in graph analytics, which is associated with numerous applications, such as community detection, protein network analysis, and system structure analysis. As the sizes of graphs are becoming increasingly large, it is challenging to compute core decomposition on a single machine. In this paper, we study the problem of k-Core decomposition in the distributed environment. Specifically, we propose the distributed Filter-Array k-Core (FAkCore) algorithm, which adopts the commonly used Scatter-Gather framework. We design an auxiliary data structure of running counts for each vertex to track the statistics of its neighbors' core number. It allows us to recompute the core number of a vertex only when the value is updated. Together with an enhanced message filtering mechanism, our method significantly reduces redundant computation and communication in the existing distributed k-Core decomposition algorithm. Experiments on 10 real-world graphs show that our method outperforms the baseline algorithms by 1.4 times on average and up to 2.2 times.}
}
@article{li2024application,
  author = {Li, Sai and Kou, Peng and Ma, Miao and Yang, Haoyu and Huang, Shuo and Yang, Zhengyi},
  journal = {IEEE Access},
  title = {Application of Semi-Supervised Learning in Image Classification: Research on Fusion of Labeled and Unlabeled Data},
  year = {2024},
  volume = {12},
  number = {},
  pages = {27331-27343},
  keywords = {Neurons;Convolutional neural networks;Medical diagnostic imaging;Image classification;Feature extraction;Training;Mathematical models;Semisupervised learning;Biomedical imaging;Quadratic programming;Quadratic neuron convolution;convolution neural network;semi-supervised learning;medical image classification},
  doi = {10.1109/ACCESS.2024.3367772},
  abstract = {Deep learning has attracted wide attention recently because of its excellent feature representation ability and end-to-end automatic learning method. Especially in clinical medical imaging diagnosis, the semi-supervised deep learning model is favored and widely used because it can make maximum use of a limited number of labeled data and combine it with a large number of unlabeled data to extract more information and knowledge from it. However, the scarcity of medical image data, the vast image size, and the instability of image quality directly affect the model's robustness, generalization, and image classification performance. Therefore, this paper proposes a new semi-supervised learning model, which uses quadratic neurons instead of traditional neurons, aiming to use quadratic convolution instead of the conventional convolution layer to improve the feature extraction ability of the model. In addition, we introduce two Dropout layers and two fully connected layers at the end of the model to enhance the nonlinear fitting ability of the network. Experiments on two large medical public data sets - ISIC 2019 and Retinopathy OCT - show that our method can improve the model's generalization performance and image classification accuracy.}
}
@article{zhang2024towards,
  author = {Zhang, Tianming and Cai, Xinwei and Chen, Lu and Yang, Zhengyi and Gao, Yunjun and Cao, Bin and Fan, Jing},
  title = {Towards efficient simulation-based constrained temporal graph pattern matching},
  journal = {World Wide Web},
  year = {2024},
  volume = {27},
  number = {3},
  pages = {22},
  abstract = {In the context of searching a single data graph G, graph pattern matching is to find all the occurrences of a pattern graph Q in G, specified by a matching rule. It is of paramount importance in many real applications such as social network analysis and cyber security, among others. A wide spectrum of studies target general graph pattern matching. However, to analyze time-relevant services such as studying the spread of diseases and detecting attack patterns, it is attractive to study inexact temporal graph pattern matching. Hence, in this paper, we propose a relaxed matching rule called constrained temporal dual simulation, and study simulation-based constrained temporal graph pattern matching which guarantees that the matching result (i) preserves the ancestor and descendant temporal connectivities; and (ii) implements edge-to-temporal path mapping. We devise a decomposition-based matching method, which first decomposes the data graph into Source Temporal Connected Components, and then performs matching on decomposed subgraphs. To speed up the matching, we define child/parent dependency relation tables and propose an efficient double hierarchical traverse strategy. Considering that the temporal graphs are naturally dynamic, we further propose update algorithms. An extensive empirical study over real-world and synthetic temporal graphs has demonstrated the effectiveness and efficiency of our approach.},
  issn = {1573-1413},
  doi = {10.1007/s11280-024-01259-2},
  url = {https://doi.org/10.1007/s11280-024-01259-2}
}
@article{li2024prototype,
  author = {Li, Lingbo and Li, Zhichun and Guo, Fusen and Yang, Haoyu and Wei, Jingtian and Yang, Zhengyi},
  journal = {IEEE Access},
  title = {Prototype Comparison Convolutional Networks for One-Shot Segmentation},
  year = {2024},
  volume = {12},
  number = {},
  pages = {54978-54990},
  keywords = {Feature extraction;Prototypes;Semantics;Finite element analysis;Semantic segmentation;Context modeling;Vectors;Convolutional neural networks;Few-shot semantic segmentation;convolutional neural network;prototype network;feature matching},
  doi = {10.1109/ACCESS.2024.3387742},
  abstract = {In few-shot semantic segmentation (FSS), the key challenges are efficiently tuning the interaction between the support set and the query set and distinguishing between context, background, and interfering items. To address these challenges, we propose prototype comparison networks for one-shot segmentation (OPCN) to capture the details required for FSS. Specifically, we offer the Fusion Interaction Module (FIM) to improve the segmentation performance by capturing the correlation and semantic information between the support set and query set features. Subsequently, we propose the Feature Enhancement Module (FEM), which aims to enhance the information required in the support set and query set features while increasing the focus on critical details by reducing the weight of the background regions to provide a more targeted feature representation for subsequent query image segmentation. Then, we propose the Feature Refinement Module (FRM) to filter irrelevant background information in the features and specify the target location region. Finally, the Feature Matching Module (FM) generates the final segmentation mask for the query image. Extensive experiments on the PASCAL-5i and COCO-20i datasets show that our approach achieves excellent performance in the case of the one-shot setup.}
}
@inproceedings{zhang2024efficient,
  author = {Zhang, Tianming and Gao, Yunjun and Zhao, Jie and Chen, Lu and Jin, Lu and Yang, Zhengyi and Cao, Bin and Fan, Jing},
  title = {Efficient Exact and Approximate Betweenness Centrality Computation for Temporal Graphs},
  year = {2024},
  isbn = {9798400701719},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3589334.3645438},
  doi = {10.1145/3589334.3645438},
  abstract = {Betweenness centrality of a vertex in a graph evaluates how often the vertex occurs in the shortest paths. It is a widely used metric of vertex importance in graph analytics. While betweenness centrality on static graphs has been extensively investigated, many real-world graphs are time-varying and modeled as temporal graphs. Examples include social networks and telecommunication networks, where a relationship between two vertices occurs at a specific time. Hence, in this paper, we target efficient methods for temporal betweenness centrality computation. We firstly propose an exact algorithm with the new notion of time instance graph, based on which, we derive a temporal dependency accumulation theory for iterative computation. To reduce the size of the time instance graph and improve the efficiency, we propose an additional optimization, which compresses the time instance graph with equivalent vertices and edges, and extends the dependency theory to the compressed graph. Since it is theoretically complex to compute temporal betweenness centrality, we further devise a probabilistically guaranteed approximate method to handle massive temporal graphs. Extensive experimental results on real-world temporal networks demonstrate the superior performance of the proposed methods. In particular, our exact and approximate methods outperform the state-of-the-art methods by up to two and five orders of magnitude, respectively.},
  booktitle = {Proceedings of the ACM on Web Conference 2024 (WWW)},
  pages = {2395-2406},
  numpages = {12},
  keywords = {betweenness centrality, temporal graph, temporal path},
  location = {Singapore, Singapore},
  series = {WWW '24}
}
@inproceedings{zhang2024tatkc,
  author = {Zhang, Tianming and Fang, Junkai and Yang, Zhengyi and Cao, Bin and Fan, Jing},
  title = {TATKC: A Temporal Graph Neural Network for Fast Approximate Temporal Katz Centrality Ranking},
  year = {2024},
  isbn = {9798400701719},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3589334.3645432},
  doi = {10.1145/3589334.3645432},
  abstract = {Numerous real-world networks are represented as temporal graphs, which capture the dynamics of connections over time. Identifying important nodes on temporal graphs has a plethora of real-life applications, such as information propagation and influential user identification, etc. Temporal Katz centrality, a popular temporal metric, gauges the importance of nodes by taking into account both the number of temporal walks and the timespan between the interactions. The computation of traditional temporal Katz centrality is computationally expensive, especially when applied to massive temporal graphs. Therefore, in this paper, we design a temporal graph neural network to approximate temporal Katz centrality computation. To the best of our knowledge, we are the first to address temporal Katz centrality computation purely from a learning-based perspective. We propose a time-injected self-attention model that consists of two phases. In the first phase, we utilize a time-injected self-attention mechanism to acquire node representations that encompass both structural information and temporal relevance. The second phase is structured as a multi-layer perceptron (MLP) which uses the learned node representation to predict node rankings. Furthermore, normalization and neighbor sampling strategies are integrated into the model to enhance its overall performance. Extensive experiments on real-world networks demonstrate the efficiency and accuracy of TATKC.},
  booktitle = {Proceedings of the ACM on Web Conference 2024 (WWW)},
  pages = {527-538},
  numpages = {12},
  keywords = {self-attention, temporal graph, temporal graph neural network, temporal katz centrality},
  location = {Singapore},
  series = {WWW '24}
}
@inproceedings{chen2024compressing,
  author = {Chen, Kaiyu and Wen, Dong and Li, Wentao and Yang, Zhengyi and Zhang, Wenjie},
  title = {On Compressing Historical Cliques in Temporal Graphs},
  booktitle = {International Conference on Database Systems for Advanced Applications (DASFAA)},
  year = {2024}
}
@inproceedings{yang2024parallel,
  author = {Yang, Wenke and Yang, Zihan and Chen, Liuyi and Yan, Ruiqing and Yang*, Zhengyi and Zhang, Linhan and Yifu, Tang},
  title = {Parallel Program Generation for Hybrid Tabular-Textual Question Answering},
  booktitle = {Asia-Pacific Web (APWeb) and Web-Age Information Management (WAIM) Joint International Conference on Web and Big Data},
  year = {2024},
  publisher = {Springer Nature Singapore},
  address = {Singapore},
  pages = {121--137},
  abstract = {Hybrid tabular-textual question answering (HTQA) involves tapping into a mosaic of data sources, traditionally managed through LSTM-based step-by-step reasoning, which has been vulnerable to exposure bias and subsequent error accumulation. This paper introduces an innovative parallel program generation method, ConcurGen, aiming to transform this paradigm by simultaneously formulating comprehensive program constructs that seamlessly blend operations and values. This approach not only rectifies the inherent pitfalls of sequential methodologies but also infuses efficiency into the process. When subjected to rigorous evaluation on benchmarks like the ConvFinQA and MultiHiertt datasets, our methodology showcased significant superiority over prevalent models such as FinQANet and MT2Net. This was evidenced by enhancements in various performance metrics, effectively raising the bar for what's deemed state-of-the-art. Notably, beyond setting these commendable benchmarks, our method facilitates a striking acceleration in program creation, achieving speeds nearly 21 times faster. Additionally, a salient feature of our approach becomes evident when numerical reasoning steps escalate: unlike traditional models, our system sustains its robust performance, emphasizing its adaptability and resilience in complex scenarios.},
  isbn = {978-981-97-7232-2},
  doi = {10.1007/978-981-97-7232-2_9}
}
@inproceedings{ding2024fgaq,
  author = {Ding, Yi and Yang*, Zhengyi and Li, Shunyang and Chen, Liuyi and Ning, Haoran and Hao, Kongzhang and Liu, Yongfei},
  title = {FGAQ: Accelerating Graph Analytical Queries Using FPGA},
  booktitle = {Asia-Pacific Web (APWeb) and Web-Age Information Management (WAIM) Joint International Conference on Web and Big Data},
  year = {2024},
  publisher = {Springer Nature Singapore},
  address = {Singapore},
  pages = {357--361},
  abstract = {Field-programmable gate arrays (FPGAs) have significant advantages in parallelism and energy efficiency over CPUs and GPUs and are widely deployed by many enterprises and cloud server providers nowadays. In this paper, we demonstrate FGAQ, an FPGA-based system for accelerating graph queries on massive graphs. FGAQ supports the two most fundamental types of graph queries, namely subgraph and path queries, and features 1) a CPU-FPGA co-designed framework, 2) a fully pipelined FPGA execution, and 3) reduced data transfer from FPGA's external memory. FGAQ provides a user-friendly interface and significantly improved performance. Performance evaluation shows that {\$}{\$}{\backslash}textsf{\{}FGAQ{\}}{\$}{\$}FGAQoutperforms the most popular graph database, Neo4j, by up to three orders of magnitude. The demo video can be found at https://www.youtube.com/watch?v=pEkzw{\_}DOQYE.},
  isbn = {978-981-97-7244-5},
  doi = {10.1007/978-981-97-7244-5_25}
}
@inproceedings{ning2024exploring,
  author = {Ning, Haoran and Han, Bocheng and Yang*, Zhengyi and Hao, Kongzhang and Ma, Miao and Wang, Chunling and Liu, Boge and Cheng, Xiaoshuang and Hao, Yu and Jin, Yi and Zhang, Wanchun and Zhang, Chengwei},
  title = {Exploring Simple Architecture of Just-in-Time Compilation in Databases},
  booktitle = {Asia-Pacific Web (APWeb) and Web-Age Information Management (WAIM) Joint International Conference on Web and Big Data},
  year = {2024},
  publisher = {Springer Nature Singapore},
  address = {Singapore},
  pages = {504--514},
  abstract = {Just-in-Time (JIT) compilation is an effective technique for enhancing query execution in modern relational databases, and it has gained increasing attention from academia and industry in recent years. However, the architectures of state-of-the-art JIT-based database systems are often complex, leading to challenges and limitations when adopted for commercial use. In this paper, we present an industrial view to JIT compilation for relational databases, emphasizing practicality and applicability. Our focus is on minimizing engineering effort, simplifying testing, and ensuring seamless integration with existing database ecosystems. We achieve these goals by adhering to three core principles: a simple, lightweight architecture; reuse of existing technologies and frameworks, particularly LLVM; and strong extensibility and compatibility. We demonstrate the feasibility and potential of this approach through an initial exploration using LLVM's mature JIT compilation capabilities to translate TPC-H database queries into optimized machine code. This proof-of-concept implementation shows the promise of our approach, pivoting the way for a comprehensive database system that leverages a lightweight yet powerful JIT compilation framework for real-world applications.},
  isbn = {978-981-97-7244-5},
  doi = {10.1007/978-981-97-7244-5_44}
}
@inproceedings{yan2024comparing,
  title = {Comparing the Career Paths of Male and Female Managers in the Construction Industry: Insights from LinkedIn Data},
  author = {Yan, Diya and Sunindijo, Riza Yosia and Wang, Cynthia C and Yang, Zhengyi},
  booktitle = {Proceedings of the 40th Annual Association of Researchers in Construction Management Conference (ARCOM)},
  year = {2024}
}
@inproceedings{qi2024hierarchical,
  author = {Luo, Qi and Zhang, Wenjie and Yang, Zhengyi and Wen, Dong and Wang, Xiaoyang and Yu, Dongxiao and Lin, Xuemin},
  title = {Hierarchical Structure Construction on Hypergraphs},
  year = {2024},
  publisher = {Association for Computing Machinery},
  booktitle = {Proceedings of the 33rd ACM International Conference on Information and Knowledge Management (CIKM)},
  series = {CIKM '24}
}
@inproceedings{yan2024approximating,
  author = {Yan, Haonan and Yang*, Zhengyi and Zhang, Tianming and Wen, Dong and Luo, Qi and Ukey, Nimish},
  title = {Approximating Temporal Katz Centrality with Monte Carlo Methods},
  booktitle = {Web and Big Data. APWeb-WAIM 2023 International Workshops},
  publisher = {Springer},
  year = {2024}
}
@inproceedings{wei2024high,
  author = {Wei, Jingtian and Yang*, Zhengyi and Luo, Qi and Zhang, Yu and Qin, Lu and Zhang, Wenjie},
  title = {High-Order Local Clustering on Hypergraphs},
  booktitle = {13th International Conference on Health Information Science (HIS)},
  year = {2024}
}
@inproceedings{ukey2024cluster,
  author = {Ukey#, Nimish and Zhang#, Guangjian and Yang*, Zhengyi and Wang, Xiaoyang and Li, Binghao and Saydam, Serkan and Zhang, Wenjie},
  title = {A Cluster-Based Approach to kNN Join over Batch-Dynamic High-Dimensional Data},
  booktitle = {International Conference Advanced Data Mining and Applications (ADMA)},
  year = {2024}
}

This file was generated by bibtex2html 1.99.